87 files changed, 33393 insertions, 19121 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
index 271ca44..4679f74 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -15,6 +15,7 @@
 #ifndef TARGET_ARM_H
 #define TARGET_ARM_H
 
+#include "ARMBaseInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cassert>
@@ -25,97 +26,17 @@ class ARMBaseTargetMachine;
 class FunctionPass;
 class JITCodeEmitter;
 class formatted_raw_ostream;
+class MCCodeEmitter;
+class TargetAsmBackend;
+class MachineInstr;
+class ARMAsmPrinter;
+class MCInst;
 
-// Enums corresponding to ARM condition codes
-namespace ARMCC {
-  // The CondCodes constants map directly to the 4-bit encoding of the
-  // condition field for predicated instructions.
-  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
-    EQ,            // Equal                      Equal
-    NE,            // Not equal                  Not equal, or unordered
-    HS,            // Carry set                  >, ==, or unordered
-    LO,            // Carry clear                Less than
-    MI,            // Minus, negative            Less than
-    PL,            // Plus, positive or zero     >, ==, or unordered
-    VS,            // Overflow                   Unordered
-    VC,            // No overflow                Not unordered
-    HI,            // Unsigned higher            Greater than, or unordered
-    LS,            // Unsigned lower or same     Less than or equal
-    GE,            // Greater than or equal      Greater than or equal
-    LT,            // Less than                  Less than, or unordered
-    GT,            // Greater than               Greater than
-    LE,            // Less than or equal         <, ==, or unordered
-    AL             // Always (unconditional)     Always (unconditional)
-  };
+MCCodeEmitter *createARMMCCodeEmitter(const Target &,
+                                      TargetMachine &TM,
+                                      MCContext &Ctx);
 
-  inline static CondCodes getOppositeCondition(CondCodes CC) {
-    switch (CC) {
-    default: llvm_unreachable("Unknown condition code");
-    case EQ: return NE;
-    case NE: return EQ;
-    case HS: return LO;
-    case LO: return HS;
-    case MI: return PL;
-    case PL: return MI;
-    case VS: return VC;
-    case VC: return VS;
-    case HI: return LS;
-    case LS: return HI;
-    case GE: return LT;
-    case LT: return GE;
-    case GT: return LE;
-    case LE: return GT;
-    }
-  }
-} // namespace ARMCC
-
-inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
-  switch (CC) {
-  default: llvm_unreachable("Unknown condition code");
-  case ARMCC::EQ:  return "eq";
-  case ARMCC::NE:  return "ne";
-  case ARMCC::HS:  return "hs";
-  case ARMCC::LO:  return "lo";
-  case ARMCC::MI:  return "mi";
-  case ARMCC::PL:  return "pl";
-  case ARMCC::VS:  return "vs";
-  case ARMCC::VC:  return "vc";
-  case ARMCC::HI:  return "hi";
-  case ARMCC::LS:  return "ls";
-  case ARMCC::GE:  return "ge";
-  case ARMCC::LT:  return "lt";
-  case ARMCC::GT:  return "gt";
-  case ARMCC::LE:  return "le";
-  case ARMCC::AL:  return "al";
-  }
-}
-
-namespace ARM_MB {
-  // The Memory Barrier Option constants map directly to the 4-bit encoding of
-  // the option field for memory barrier operations.
-  enum MemBOpt {
-    ST    = 14,
-    ISH   = 11,
-    ISHST = 10,
-    NSH   = 7,
-    NSHST = 6,
-    OSH   = 3,
-    OSHST = 2
-  };
-
-  inline static const char *MemBOptToString(unsigned val) {
-    switch (val) {
-    default: llvm_unreachable("Unknown memory opetion");
-    case ST:    return "st";
-    case ISH:   return "ish";
-    case ISHST: return "ishst";
-    case NSH:   return "nsh";
-    case NSHST: return "nshst";
-    case OSH:   return "osh";
-    case OSHST: return "oshst";
-    }
-  }
-} // namespace ARM_MB
+TargetAsmBackend *createARMAsmBackend(const Target &, const std::string &);
 
 FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
@@ -127,23 +48,16 @@ FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
 FunctionPass *createARMConstantIslandPass();
-FunctionPass *createNEONPreAllocPass();
 FunctionPass *createNEONMoveFixPass();
+FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createThumb2SizeReductionPass();
 
 extern Target TheARMTarget, TheThumbTarget;
 
-} // end namespace llvm;
-
-// Defines symbolic names for ARM registers.  This defines a mapping from
-// register name to register number.
-//
-#include "ARMGenRegisterNames.inc"
-
-// Defines symbolic names for the ARM instructions.
-//
-#include "ARMGenInstrNames.inc"
+void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                  ARMAsmPrinter &AP);
 
+} // end namespace llvm;
 
 #endif
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
index d6a8f19..bf4315f 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -16,6 +16,7 @@
 
 include "llvm/Target/Target.td"
 
+
 //===----------------------------------------------------------------------===//
 // ARM Subtarget features.
 //
@@ -32,6 +33,8 @@ def FeatureNoARM  : SubtargetFeature<"noarm", "NoARM", "true",
                                      "Does not support ARM mode execution">;
 def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
                                      "Enable half-precision floating point">;
+def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
+                                     "Restrict VFP3 to 16 double registers">;
 def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
                                      "Enable divide instructions">;
 def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
@@ -43,14 +46,11 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
 def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
                           "Floating point unit supports single precision only">;
 
-// Some processors have multiply-accumulate instructions that don't
-// play nicely with other VFP instructions, and it's generally better
+// Some processors have FP multiply-accumulate instructions that don't
+// play nicely with other VFP / NEON instructions, and it's generally better
 // to just not use them.
-// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for
-// others as well. We should do more benchmarking and confirm one way or
-// the other.
-def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true",
-                                          "Disable VFP MAC instructions">;
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+                                         "Disable VFP / NEON MAC instructions">;
 // Some processors benefit from using NEON instructions for scalar
 // single-precision FP operations.
 def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
@@ -61,6 +61,9 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
                                              "Prefer 32-bit Thumb instrs">;
 
+// Multiprocessing extension.
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+                                 "Supports Multiprocessing extension">;
 
 // ARM architectures.
 def ArchV4T     : SubtargetFeature<"v4t", "ARMArchVersion", "V4T",
@@ -91,6 +94,18 @@ def ArchV7M     : SubtargetFeature<"v7m", "ARMArchVersion", "V7M",
 
 include "ARMSchedule.td"
 
+// ARM processor families.
+def ProcOthers  : SubtargetFeature<"others", "ARMProcFamily", "Others",
+                                   "One of the other ARM processor families">;
+def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+                                   "Cortex-A8 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureNEONForFP,
+                                    FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
+def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+                                   "Cortex-A9 ARM processors",
+                                   [FeatureHasSlowFPVMLx, FeatureT2XtPk,
+                                    FeatureFP16]>;
+
 class ProcNoItin<string Name, list<SubtargetFeature> Features>
  : Processor<Name, GenericItineraries, Features>;
 
@@ -135,25 +150,27 @@ def : ProcNoItin<"iwmmxt",          [ArchV5TE]>;
 // V6 Processors.
 def : Processor<"arm1136j-s",       ARMV6Itineraries, [ArchV6]>;
 def : Processor<"arm1136jf-s",      ARMV6Itineraries, [ArchV6, FeatureVFP2,
-                                                       FeatureHasSlowVMLx]>;
+                                                       FeatureHasSlowFPVMLx]>;
 def : Processor<"arm1176jz-s",      ARMV6Itineraries, [ArchV6]>;
-def : Processor<"arm1176jzf-s",     ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
+def : Processor<"arm1176jzf-s",     ARMV6Itineraries, [ArchV6, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
 def : Processor<"mpcorenovfp",      ARMV6Itineraries, [ArchV6]>;
-def : Processor<"mpcore",           ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
+def : Processor<"mpcore",           ARMV6Itineraries, [ArchV6, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
 
 // V6M Processors.
 def : Processor<"cortex-m0",        ARMV6Itineraries, [ArchV6M]>;
 
 // V6T2 Processors.
 def : Processor<"arm1156t2-s",      ARMV6Itineraries, [ArchV6T2]>;
-def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>;
+def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [ArchV6T2, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
 
 // V7 Processors.
 def : Processor<"cortex-a8",        CortexA8Itineraries,
-                [ArchV7A, FeatureHasSlowVMLx,
-                 FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2XtPk]>;
+                                    [ArchV7A, ProcA8]>;
 def : Processor<"cortex-a9",        CortexA9Itineraries,
-                [ArchV7A, FeatureT2XtPk]>;
+                                    [ArchV7A, ProcA9]>;
 
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [ArchV7M]>;
@@ -175,6 +192,17 @@ include "ARMInstrInfo.td"
 
 def ARMInstrInfo : InstrInfo;
 
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// ARM Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def ARMAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Declare the target which we are implementing
 //===----------------------------------------------------------------------===//
@@ -182,4 +210,6 @@ def ARMInstrInfo : InstrInfo;
 def ARM : Target {
   // Pull in Instruction Info:
   let InstructionSet = ARMInstrInfo;
+
+  let AssemblyWriters = [ARMAsmWriter];
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h
index db48100..19fbf05 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h
+++ b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h
@@ -50,6 +50,16 @@ namespace ARM_AM {
     }
   }
 
+  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+    switch (Op) {
+    default: assert(0 && "Unknown shift opc!");
+    case ARM_AM::asr: return 2;
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::ror: return 3;
+    }
+  }
+
   static inline ShiftOpc getShiftOpcForNode(SDValue N) {
     switch (N.getOpcode()) {
     default:          return ARM_AM::no_shift;
@@ -566,6 +576,8 @@ namespace ARM_AM {
     return Val;
   }
 
+  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
+
 } // end namespace ARM_AM
 } // end namespace llvm
 
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp
new file mode 100644
index 0000000..ec23449
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmBackend.cpp
@@ -0,0 +1,512 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmBackend.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+namespace {
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+public:
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/true) {}
+};
+
+class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  ARMELFObjectWriter(Triple::OSType OSType)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSType, ELF::EM_ARM,
+                              /*HasRelocationAddend*/ false) {}
+};
+
+class ARMAsmBackend : public TargetAsmBackend {
+  bool isThumbMode;  // Currently emitting Thumb code.
+public:
+  ARMAsmBackend(const Target &T) : TargetAsmBackend(), isThumbMode(false) {}
+
+  unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name                      Offset (bits) Size (bits)     Flags
+{ "fixup_arm_ldst_pcrel_12", 1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10",      1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12",  1,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx",     7,            21,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp",      1,             8,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bcc",     1,             8,  MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16",     0,            20,  0 },
+{ "fixup_arm_movw_lo16",     0,            20,  0 },
+{ "fixup_t2_movt_hi16",      0,            20,  0 },
+{ "fixup_t2_movw_lo16",      0,            20,  0 },
+{ "fixup_arm_movt_hi16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_movw_lo16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movt_hi16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movw_lo16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return TargetAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  bool MayNeedRelaxation(const MCInst &Inst) const;
+
+  void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  void HandleAssemblerFlag(MCAssemblerFlag Flag) {
+    switch (Flag) {
+    default: break;
+    case MCAF_Code16:
+      setIsThumb(true);
+      break;
+    case MCAF_Code32:
+      setIsThumb(false);
+      break;
+    }
+  }
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+};
+} // end anonymous namespace
+
+bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+  // FIXME: Thumb targets, different move constant targets..
+  return false;
+}
+
+void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented");
+  return;
+}
+
+bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if (isThumb()) {
+    // FIXME: 0xbf00 is the ARMv7 value. For v6 and before, we'll need to
+    // use 0x46c0 (which is a 'mov r8, r8' insn).
+    uint64_t NumNops = Count / 2;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write16(0xbf00);
+    if (Count & 1)
+      OW->Write8(0);
+    return true;
+  }
+  // ARM mode
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(0xe1a00000);
+  switch (Count % 4) {
+  default: break; // No leftover bytes to write
+  case 1: OW->Write8(0); break;
+  case 2: OW->Write16(0); break;
+  case 3: OW->Write16(0); OW->Write8(0xa0); break;
+  }
+
+  return true;
+}
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned Lo12 = Value & 0x0FFF;
+    // inst{19-16} = Hi4;
+    // inst{11-0} = Lo12;
+    Value = (Hi4 << 16) | (Lo12);
+    return Value;
+  }
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned i = (Value & 0x800) >> 11;
+    unsigned Mid3 = (Value & 0x700) >> 8;
+    unsigned Lo8 = Value & 0x0FF;
+    // inst{19-16} = Hi4;
+    // inst{26} = i;
+    // inst{14-12} = Mid3;
+    // inst{7-0} = Lo8;
+    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+
+    uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+    swapped |= (Value & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_ldst_pcrel_12:
+    // ARM PC-relative values are offset by 8.
+    Value -= 4;
+    // FALLTHROUGH
+  case ARM::fixup_t2_ldst_pcrel_12: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value -= 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    assert ((Value < 4096) && "Out of range pc-relative fixup value!");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12) {
+      uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return ((Value - 4) >> 2) & 0xff;
+  case ARM::fixup_arm_adr_pcrel_12: {
+    // ARM PC-relative values are offset by 8.
+    Value -= 8;
+    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 2; // 0b0010
+    }
+    assert(ARM_AM::getSOImmVal(Value) != -1 &&
+           "Out of range pc-relative fixup value!");
+    // Encode the immediate and shift the opcode into place.
+    return ARM_AM::getSOImmVal(Value) | (opc << 21);
+  }
+
+  case ARM::fixup_t2_adr_pcrel_12: {
+    Value -= 4;
+    unsigned opc = 0;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 5;
+    }
+
+    uint32_t out = (opc << 21);
+    out |= (Value & 0x800) << 14;
+    out |= (Value & 0x700) << 4;
+    out |= (Value & 0x0FF);
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    // These values don't encode the low two bits since they're always zero.
+    // Offset by 8 just as above.
+    return 0xffffff & ((Value - 8) >> 2);
+  case ARM::fixup_t2_uncondbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint32_t out = 0;
+    bool I =  Value & 0x800000;
+    bool J1 = Value & 0x400000;
+    bool J2 = Value & 0x200000;
+    J1 ^= I;
+    J2 ^= I;
+
+    out |= I  << 26; // S bit
+    out |= !J1 << 13; // J1 bit
+    out |= !J2 << 11; // J2 bit
+    out |= (Value & 0x1FF800)  << 5; // imm6 field
+    out |= (Value & 0x0007FF);        // imm11 field
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_t2_condbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint64_t out = 0;
+    out |= (Value & 0x80000) << 7; // S bit
+    out |= (Value & 0x40000) >> 7; // J2 bit
+    out |= (Value & 0x20000) >> 4; // J1 bit
+    out |= (Value & 0x1F800) << 5; // imm6 field
+    out |= (Value & 0x007FF);      // imm11 field
+
+    uint32_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_thumb_bl: {
+    // The value doesn't encode the low bit (always zero) and is offset by
+    // four. The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit
+    //
+    //   BL:  xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    uint32_t Binary = 0;
+    Value = 0x3fffff & ((Value - 4) >> 1);
+    Binary  = (Value & 0x7ff) << 16;    // Low imm11 value.
+    Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value.
+    Binary |= isNeg << 10;              // Sign bit.
+    return Binary;
+  }
+  case ARM::fixup_arm_thumb_blx: {
+    // The value doesn't encode the low two bits (always zero) and is offset by
+    // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit
+    // positions in the destination opcode. x = unchanged, I = immediate value
+    // bit, S = sign extension bit, 0 = zero.
+    //
+    //   BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    uint32_t Binary = 0;
+    Value = 0xfffff & ((Value - 2) >> 2);
+    Binary  = (Value & 0x3ff) << 17;    // Low imm10L value.
+    Binary |= (Value & 0xffc00) >> 10;  // High imm10H value.
+    Binary |= isNeg << 10;              // Sign bit.
+    return Binary;
+  }
+  case ARM::fixup_arm_thumb_cp:
+    // Offset by 4, and don't encode the low two bits. Two bytes of that
+    // 'off by 4' is implicitly handled by the half-word ordering of the
+    // Thumb encoding, so we only need to adjust by 2 here.
+    return ((Value - 2) >> 2) & 0xff;
+  case ARM::fixup_arm_thumb_cb: {
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    uint32_t Binary = (Value - 4) >> 1;
+    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
+  }
+  case ARM::fixup_arm_thumb_br:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0x7ff;
+  case ARM::fixup_arm_thumb_bcc:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0xff;
+  case ARM::fixup_arm_pcrel_10:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    // Fall through.
+  case ARM::fixup_t2_pcrel_10: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low two bits since they're always zero.
+    Value >>= 2;
+    assert ((Value < 256) && "Out of range pc-relative fixup value!");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_pcrel_10) {
+      uint32_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  }
+}
+
+namespace {
+
+// FIXME: This should be in a separate file.
+// ELF is an ELF of course...
+class ELFARMAsmBackend : public ARMAsmBackend {
+public:
+  Triple::OSType OSType;
+  ELFARMAsmBackend(const Target &T, Triple::OSType _OSType)
+    : ARMAsmBackend(T), OSType(_OSType) { }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createELFObjectWriter(new ARMELFObjectWriter(OSType), OS,
+                              /*IsLittleEndian*/ true);
+  }
+};
+
+// FIXME: Raise this to share code between Darwin and ELF.
+void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value) const {
+  unsigned NumBytes = 4;        // FIXME: 2 for Thumb
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset % NumBytes == 0 && "Offset mod NumBytes is nonzero!");
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value. The Value has been "split up" into the appropriate
+  // bitfields above.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+// FIXME: This should be in a separate file.
+class DarwinARMAsmBackend : public ARMAsmBackend {
+public:
+  DarwinARMAsmBackend(const Target &T) : ARMAsmBackend(T) { }
+
+  void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    // FIXME: Subtarget info should be derived. Force v7 for now.
+    return createMachObjectWriter(new ARMMachObjectWriter(
+                                    /*Is64Bit=*/false,
+                                    object::mach::CTM_ARM,
+                                    object::mach::CSARM_V7),
+                                  OS,
+                                  /*IsLittleEndian=*/true);
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    return false;
+  }
+};
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return 1;
+
+  case FK_Data_2:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    return 2;
+
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    return 3;
+
+  case FK_Data_4:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_arm_movw_lo16_pcrel:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    return 4;
+  }
+}
+
+void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+} // end anonymous namespace
+
+TargetAsmBackend *llvm::createARMAsmBackend(const Target &T,
+                                            const std::string &TT) {
+  switch (Triple(TT).getOS()) {
+  case Triple::Darwin:
+    return new DarwinARMAsmBackend(T);
+  case Triple::MinGW32:
+  case Triple::Cygwin:
+  case Triple::Win32:
+    assert(0 && "Windows not supported on ARM");
+  default:
+    return new ELFARMAsmBackend(T, Triple(TT).getOS());
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 6cfd596..db12b8e 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -14,28 +14,31 @@
 
 #define DEBUG_TYPE "asm-printer"
 #include "ARM.h"
-#include "ARMBuildAttrs.h"
+#include "ARMAsmPrinter.h"
 #include "ARMAddressingModes.h"
+#include "ARMBuildAttrs.h"
+#include "ARMBaseRegisterInfo.h"
 #include "ARMConstantPoolValue.h"
-#include "AsmPrinter/ARMInstPrinter.h"
 #include "ARMMachineFunctionInfo.h"
-#include "ARMMCInstLower.h"
+#include "ARMMCExpr.h"
 #include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "InstPrinter/ARMInstPrinter.h"
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Type.h"
 #include "llvm/Assembly/Writer.h"
-#include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
@@ -53,270 +56,127 @@
 #include <cctype>
 using namespace llvm;
 
-static cl::opt<bool>
-EnableMCInst("enable-arm-mcinst-printer", cl::Hidden,
-            cl::desc("enable experimental asmprinter gunk in the arm backend"));
-
-namespace llvm {
-  namespace ARM {
-    enum DW_ISA {
-      DW_ISA_ARM_thumb = 1,
-      DW_ISA_ARM_arm = 2
-    };
-  }
-}
-
 namespace {
-  class ARMAsmPrinter : public AsmPrinter {
-
-    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
-    /// make the right decision when printing asm code for different targets.
-    const ARMSubtarget *Subtarget;
 
-    /// AFI - Keep a pointer to ARMFunctionInfo for the current
-    /// MachineFunction.
-    ARMFunctionInfo *AFI;
+  // Per section and per symbol attributes are not supported.
+  // To implement them we would need the ability to delay this emission
+  // until the assembly file is fully parsed/generated as only then do we
+  // know the symbol and section numbers.
+  class AttributeEmitter {
+  public:
+    virtual void MaybeSwitchVendor(StringRef Vendor) = 0;
+    virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0;
+    virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0;
+    virtual void Finish() = 0;
+    virtual ~AttributeEmitter() {}
+  };
 
-    /// MCP - Keep a pointer to constantpool entries of the current
-    /// MachineFunction.
-    const MachineConstantPool *MCP;
+  class AsmAttributeEmitter : public AttributeEmitter {
+    MCStreamer &Streamer;
 
   public:
-    explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) {
-      Subtarget = &TM.getSubtarget<ARMSubtarget>();
-    }
+    AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {}
+    void MaybeSwitchVendor(StringRef Vendor) { }
 
-    virtual const char *getPassName() const {
-      return "ARM Assembly Printer";
+    void EmitAttribute(unsigned Attribute, unsigned Value) {
+      Streamer.EmitRawText("\t.eabi_attribute " +
+                           Twine(Attribute) + ", " + Twine(Value));
     }
 
-    void printInstructionThroughMCStreamer(const MachineInstr *MI);
-
-
-    void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
-                      const char *Modifier = 0);
-    void printSOImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
-    void printSOImm2PartOperand(const MachineInstr *MI, int OpNum,
-                                raw_ostream &O);
-    void printSORegOperand(const MachineInstr *MI, int OpNum,
-                           raw_ostream &O);
-    void printAddrMode2Operand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printAddrMode3Operand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printAddrMode4Operand(const MachineInstr *MI, int OpNum,raw_ostream &O,
-                               const char *Modifier = 0);
-    void printAddrMode5Operand(const MachineInstr *MI, int OpNum,raw_ostream &O,
-                               const char *Modifier = 0);
-    void printAddrMode6Operand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printAddrMode6OffsetOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printAddrModePCOperand(const MachineInstr *MI, int OpNum,
-                                raw_ostream &O,
-                                const char *Modifier = 0);
-    void printBitfieldInvMaskImmOperand(const MachineInstr *MI, int OpNum,
-                                        raw_ostream &O);
-    void printMemBOption(const MachineInstr *MI, int OpNum,
-                         raw_ostream &O);
-    void printShiftImmOperand(const MachineInstr *MI, int OpNum,
-                              raw_ostream &O);
-
-    void printThumbS4ImmOperand(const MachineInstr *MI, int OpNum,
-                                raw_ostream &O);
-    void printThumbITMask(const MachineInstr *MI, int OpNum, raw_ostream &O);
-    void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum,
-                                      raw_ostream &O,
-                                      unsigned Scale);
-    void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-
-    void printT2SOOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
-    void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-    void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum,
-                                    raw_ostream &O);
-    void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum,
-                                      raw_ostream &O);
-    void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum,
-                                          raw_ostream &O);
-    void printT2AddrModeImm8s4OffsetOperand(const MachineInstr *MI, int OpNum,
-                                            raw_ostream &O) {}
-    void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O);
-
-    void printCPSOptionOperand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O) {}
-    void printMSRMaskOperand(const MachineInstr *MI, int OpNum,
-                             raw_ostream &O) {}
-    void printNegZeroOperand(const MachineInstr *MI, int OpNum,
-                             raw_ostream &O) {}
-    void printPredicateOperand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printMandatoryPredicateOperand(const MachineInstr *MI, int OpNum,
-                                        raw_ostream &O);
-    void printSBitModifierOperand(const MachineInstr *MI, int OpNum,
-                                  raw_ostream &O);
-    void printPCLabel(const MachineInstr *MI, int OpNum,
-                      raw_ostream &O);
-    void printRegisterList(const MachineInstr *MI, int OpNum,
-                           raw_ostream &O);
-    void printCPInstOperand(const MachineInstr *MI, int OpNum,
-                            raw_ostream &O,
-                            const char *Modifier);
-    void printJTBlockOperand(const MachineInstr *MI, int OpNum,
-                             raw_ostream &O);
-    void printJT2BlockOperand(const MachineInstr *MI, int OpNum,
-                              raw_ostream &O);
-    void printTBAddrMode(const MachineInstr *MI, int OpNum,
-                         raw_ostream &O);
-    void printNoHashImmediate(const MachineInstr *MI, int OpNum,
-                              raw_ostream &O);
-    void printVFPf32ImmOperand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O);
-    void printNEONModImmOperand(const MachineInstr *MI, int OpNum,
-                                raw_ostream &O);
-
-    virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                                 unsigned AsmVariant, const char *ExtraCode,
-                                 raw_ostream &O);
-    virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
-                                       unsigned AsmVariant,
-                                       const char *ExtraCode, raw_ostream &O);
-
-    void printInstruction(const MachineInstr *MI, raw_ostream &O); // autogen
-    static const char *getRegisterName(unsigned RegNo);
-
-    virtual void EmitInstruction(const MachineInstr *MI);
-    bool runOnMachineFunction(MachineFunction &F);
-
-    virtual void EmitConstantPool() {} // we emit constant pools customly!
-    virtual void EmitFunctionEntryLabel();
-    void EmitStartOfAsmFile(Module &M);
-    void EmitEndOfAsmFile(Module &M);
-
-    MachineLocation getDebugValueLocation(const MachineInstr *MI) const {
-      MachineLocation Location;
-      assert (MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-      // Frame address.  Currently handles register +- offset only.
-      if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
-        Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-      else {
-        DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+    void EmitTextAttribute(unsigned Attribute, StringRef String) {
+      switch (Attribute) {
+      case ARMBuildAttrs::CPU_name:
+        Streamer.EmitRawText(StringRef("\t.cpu ") + LowercaseString(String));
+        break;
+      default: assert(0 && "Unsupported Text attribute in ASM Mode"); break;
       }
-      return Location;
+    }
+    void Finish() { }
+  };
+
+  class ObjectAttributeEmitter : public AttributeEmitter {
+    MCObjectStreamer &Streamer;
+    StringRef CurrentVendor;
+    SmallString<64> Contents;
+
+  public:
+    ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
+      Streamer(Streamer_), CurrentVendor("") { }
+
+    void MaybeSwitchVendor(StringRef Vendor) {
+      assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+      if (CurrentVendor.empty())
+        CurrentVendor = Vendor;
+      else if (CurrentVendor == Vendor)
+        return;
+      else
+        Finish();
+
+      CurrentVendor = Vendor;
+
+      assert(Contents.size() == 0);
     }
 
-    virtual unsigned getISAEncoding() {
-      // ARM/Darwin adds ISA to the DWARF info for each function.
-      if (!Subtarget->isTargetDarwin())
-        return 0;
-      return Subtarget->isThumb() ?
-        llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm;
+    void EmitAttribute(unsigned Attribute, unsigned Value) {
+      // FIXME: should be ULEB
+      Contents += Attribute;
+      Contents += Value;
     }
 
-    MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
-                                          const MachineBasicBlock *MBB) const;
-    MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
-
-    /// EmitMachineConstantPoolValue - Print a machine constantpool value to
-    /// the .s file.
-    virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
-      SmallString<128> Str;
-      raw_svector_ostream OS(Str);
-      EmitMachineConstantPoolValue(MCPV, OS);
-      OutStreamer.EmitRawText(OS.str());
+    void EmitTextAttribute(unsigned Attribute, StringRef String) {
+      Contents += Attribute;
+      Contents += UppercaseString(String);
+      Contents += 0;
     }
 
-    void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV,
-                                      raw_ostream &O) {
-      switch (TM.getTargetData()->getTypeAllocSize(MCPV->getType())) {
-      case 1: O << MAI->getData8bitsDirective(0); break;
-      case 2: O << MAI->getData16bitsDirective(0); break;
-      case 4: O << MAI->getData32bitsDirective(0); break;
-      default: assert(0 && "Unknown CPV size");
-      }
+    void Finish() {
+      const size_t ContentsSize = Contents.size();
 
-      ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
-
-      if (ACPV->isLSDA()) {
-        O << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
-      } else if (ACPV->isBlockAddress()) {
-        O << *GetBlockAddressSymbol(ACPV->getBlockAddress());
-      } else if (ACPV->isGlobalValue()) {
-        const GlobalValue *GV = ACPV->getGV();
-        bool isIndirect = Subtarget->isTargetDarwin() &&
-          Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
-        if (!isIndirect)
-          O << *Mang->getSymbol(GV);
-        else {
-          // FIXME: Remove this when Darwin transition to @GOT like syntax.
-          MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
-          O << *Sym;
-
-          MachineModuleInfoMachO &MMIMachO =
-            MMI->getObjFileInfo<MachineModuleInfoMachO>();
-          MachineModuleInfoImpl::StubValueTy &StubSym =
-            GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) :
-                                        MMIMachO.getGVStubEntry(Sym);
-          if (StubSym.getPointer() == 0)
-            StubSym = MachineModuleInfoImpl::
-              StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
-        }
-      } else {
-        assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
-        O << *GetExternalSymbolSymbol(ACPV->getSymbol());
-      }
+      // Vendor size + Vendor name + '\0'
+      const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
 
-      if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
-      if (ACPV->getPCAdjustment() != 0) {
-        O << "-(" << MAI->getPrivateGlobalPrefix() << "PC"
-          << getFunctionNumber() << "_"  << ACPV->getLabelId()
-          << "+" << (unsigned)ACPV->getPCAdjustment();
-         if (ACPV->mustAddCurrentAddress())
-           O << "-.";
-         O << ')';
-      }
+      // Tag + Tag Size
+      const size_t TagHeaderSize = 1 + 4;
+
+      Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+      Streamer.EmitBytes(CurrentVendor, 0);
+      Streamer.EmitIntValue(0, 1); // '\0'
+
+      Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+      Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+      Streamer.EmitBytes(Contents, 0);
+
+      Contents.clear();
     }
   };
+
 } // end of anonymous namespace
 
-#include "ARMGenAsmWriter.inc"
+MachineLocation ARMAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  // Frame address.  Currently handles register +- offset only.
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
 
 void ARMAsmPrinter::EmitFunctionEntryLabel() {
   if (AFI->isThumbFunction()) {
-    OutStreamer.EmitRawText(StringRef("\t.code\t16"));
-    if (!Subtarget->isTargetDarwin())
-      OutStreamer.EmitRawText(StringRef("\t.thumb_func"));
-    else {
-      // This needs to emit to a temporary string to get properly quoted
-      // MCSymbols when they have spaces in them.
-      SmallString<128> Tmp;
-      raw_svector_ostream OS(Tmp);
-      OS << "\t.thumb_func\t" << *CurrentFnSym;
-      OutStreamer.EmitRawText(OS.str());
-    }
+    OutStreamer.EmitAssemblerFlag(MCAF_Code16);
+    OutStreamer.EmitThumbFunc(Subtarget->isTargetDarwin()? CurrentFnSym : 0);
   }
 
   OutStreamer.EmitLabel(CurrentFnSym);
 }
 
-/// runOnMachineFunction - This uses the printInstruction()
+/// runOnMachineFunction - This uses the EmitInstruction()
 /// method to print assembly for each instruction.
 ///
 bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -337,32 +197,18 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
   case MachineOperand::MO_Register: {
     unsigned Reg = MO.getReg();
     assert(TargetRegisterInfo::isPhysicalRegister(Reg));
-    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
-      unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0);
-      unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1);
-      O << '{'
-        << getRegisterName(DRegLo) << ", " << getRegisterName(DRegHi)
-        << '}';
-    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
-      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
-      unsigned DReg =
-        TM.getRegisterInfo()->getMatchingSuperReg(Reg,
-          RegNum & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass);
-      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
-    } else {
-      assert(!MO.getSubReg() && "Subregs should be eliminated!");
-      O << getRegisterName(Reg);
-    }
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    O << ARMInstPrinter::getRegisterName(Reg);
     break;
   }
   case MachineOperand::MO_Immediate: {
     int64_t Imm = MO.getImm();
     O << '#';
     if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
-        (TF & ARMII::MO_LO16))
+        (TF == ARMII::MO_LO16))
       O << ":lower16:";
     else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
-             (TF & ARMII::MO_HI16))
+             (TF == ARMII::MO_HI16))
       O << ":upper16:";
     O << Imm;
     break;
@@ -371,9 +217,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     O << *MO.getMBB()->getSymbol();
     return;
   case MachineOperand::MO_GlobalAddress: {
-    bool isCallOp = Modifier && !strcmp(Modifier, "call");
     const GlobalValue *GV = MO.getGlobal();
-
     if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
         (TF & ARMII::MO_LO16))
       O << ":lower16:";
@@ -383,18 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     O << *Mang->getSymbol(GV);
 
     printOffset(MO.getOffset(), O);
-
-    if (isCallOp && Subtarget->isTargetELF() &&
-        TM.getRelocationModel() == Reloc::PIC_)
+    if (TF == ARMII::MO_PLT)
       O << "(PLT)";
     break;
   }
   case MachineOperand::MO_ExternalSymbol: {
-    bool isCallOp = Modifier && !strcmp(Modifier, "call");
     O << *GetExternalSymbolSymbol(MO.getSymbolName());
-
-    if (isCallOp && Subtarget->isTargetELF() &&
-        TM.getRelocationModel() == Reloc::PIC_)
+    if (TF == ARMII::MO_PLT)
       O << "(PLT)";
     break;
   }
@@ -407,538 +246,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
   }
 }
 
-static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
-                       const MCAsmInfo *MAI) {
-  // Break it up into two parts that make up a shifter immediate.
-  V = ARM_AM::getSOImmVal(V);
-  assert(V != -1 && "Not a valid so_imm value!");
-
-  unsigned Imm = ARM_AM::getSOImmValImm(V);
-  unsigned Rot = ARM_AM::getSOImmValRot(V);
-
-  // Print low-level immediate formation info, per
-  // A5.1.3: "Data-processing operands - Immediate".
-  if (Rot) {
-    O << "#" << Imm << ", " << Rot;
-    // Pretty printed version.
-    if (VerboseAsm) {
-      O << "\t" << MAI->getCommentString() << ' ';
-      O << (int)ARM_AM::rotr32(Imm, Rot);
-    }
-  } else {
-    O << "#" << Imm;
-  }
-}
-
-/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
-/// immediate in bits 0-7.
-void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum,
-                                      raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  assert(MO.isImm() && "Not a valid so_imm value!");
-  printSOImm(O, MO.getImm(), isVerbose(), MAI);
-}
-
-/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
-/// followed by an 'orr' to materialize.
-void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum,
-                                           raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  assert(MO.isImm() && "Not a valid so_imm value!");
-  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm());
-  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm());
-  printSOImm(O, V1, isVerbose(), MAI);
-  O << "\n\torr";
-  printPredicateOperand(MI, 2, O);
-  O << "\t";
-  printOperand(MI, 0, O);
-  O << ", ";
-  printOperand(MI, 0, O);
-  O << ", ";
-  printSOImm(O, V2, isVerbose(), MAI);
-}
-
-// so_reg is a 4-operand unit corresponding to register forms of the A5.1
-// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
-//    REG 0   0           - e.g. R5
-//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
-//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
-void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op,
-                                      raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  const MachineOperand &MO3 = MI->getOperand(Op+2);
-
-  O << getRegisterName(MO1.getReg());
-
-  // Print the shift opc.
-  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
-  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
-  if (MO2.getReg()) {
-    O << ' ' << getRegisterName(MO2.getReg());
-    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
-  } else if (ShOpc != ARM_AM::rrx) {
-    O << " #" << ARM_AM::getSORegOffset(MO3.getImm());
-  }
-}
-
-void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op,
-                                          raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  const MachineOperand &MO3 = MI->getOperand(Op+2);
-
-  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
-    printOperand(MI, Op, O);
-    return;
-  }
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  if (!MO2.getReg()) {
-    if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
-      O << ", #"
-        << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
-        << ARM_AM::getAM2Offset(MO3.getImm());
-    O << "]";
-    return;
-  }
-
-  O << ", "
-    << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
-    << getRegisterName(MO2.getReg());
-
-  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
-    O << ", "
-      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
-      << " #" << ShImm;
-  O << "]";
-}
-
-void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op,
-                                                raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-
-  if (!MO1.getReg()) {
-    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
-    O << "#"
-      << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
-      << ImmOffs;
-    return;
-  }
-
-  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
-    << getRegisterName(MO1.getReg());
-
-  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
-    O << ", "
-      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
-      << " #" << ShImm;
-}
-
-void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op,
-                                          raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  const MachineOperand &MO3 = MI->getOperand(Op+2);
-
-  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
-  O << "[" << getRegisterName(MO1.getReg());
-
-  if (MO2.getReg()) {
-    O << ", "
-      << (char)ARM_AM::getAM3Op(MO3.getImm())
-      << getRegisterName(MO2.getReg())
-      << "]";
-    return;
-  }
-
-  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
-    O << ", #"
-      << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
-      << ImmOffs;
-  O << "]";
-}
-
-void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op,
-                                                raw_ostream &O){
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-
-  if (MO1.getReg()) {
-    O << (char)ARM_AM::getAM3Op(MO2.getImm())
-      << getRegisterName(MO1.getReg());
-    return;
-  }
-
-  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
-  O << "#"
-    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
-    << ImmOffs;
-}
-
-void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
-                                          raw_ostream &O,
-                                          const char *Modifier) {
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
-  if (Modifier && strcmp(Modifier, "submode") == 0) {
-    O << ARM_AM::getAMSubModeStr(Mode);
-  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
-    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
-    if (Mode == ARM_AM::ia)
-      O << ".w";
-  } else {
-    printOperand(MI, Op, O);
-  }
-}
-
-void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
-                                          raw_ostream &O,
-                                          const char *Modifier) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-
-  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
-    printOperand(MI, Op, O);
-    return;
-  }
-
-  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
-    O << ", #"
-      << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
-      << ImmOffs*4;
-  }
-  O << "]";
-}
-
-void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op,
-                                          raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-
-  O << "[" << getRegisterName(MO1.getReg());
-  if (MO2.getImm()) {
-    // FIXME: Both darwin as and GNU as violate ARM docs here.
-    O << ", :" << (MO2.getImm() << 3);
-  }
-  O << "]";
-}
-
-void ARMAsmPrinter::printAddrMode6OffsetOperand(const MachineInstr *MI, int Op,
-                                                raw_ostream &O){
-  const MachineOperand &MO = MI->getOperand(Op);
-  if (MO.getReg() == 0)
-    O << "!";
-  else
-    O << ", " << getRegisterName(MO.getReg());
-}
-
-void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O,
-                                           const char *Modifier) {
-  if (Modifier && strcmp(Modifier, "label") == 0) {
-    printPCLabel(MI, Op+1, O);
-    return;
-  }
-
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
-  O << "[pc, " << getRegisterName(MO1.getReg()) << "]";
-}
-
-void
-ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op,
-                                              raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(Op);
-  uint32_t v = ~MO.getImm();
-  int32_t lsb = CountTrailingZeros_32(v);
-  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
-  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
-  O << "#" << lsb << ", #" << width;
-}
-
-void
-ARMAsmPrinter::printMemBOption(const MachineInstr *MI, int OpNum,
-                               raw_ostream &O) {
-  unsigned val = MI->getOperand(OpNum).getImm();
-  O << ARM_MB::MemBOptToString(val);
-}
-
-void ARMAsmPrinter::printShiftImmOperand(const MachineInstr *MI, int OpNum,
-                                         raw_ostream &O) {
-  unsigned ShiftOp = MI->getOperand(OpNum).getImm();
-  ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
-  switch (Opc) {
-  case ARM_AM::no_shift:
-    return;
-  case ARM_AM::lsl:
-    O << ", lsl #";
-    break;
-  case ARM_AM::asr:
-    O << ", asr #";
-    break;
-  default:
-    assert(0 && "unexpected shift opcode for shift immediate operand");
-  }
-  O << ARM_AM::getSORegOffset(ShiftOp);
-}
-
-//===--------------------------------------------------------------------===//
-
-void ARMAsmPrinter::printThumbS4ImmOperand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O) {
-  O << "#" <<  MI->getOperand(Op).getImm() * 4;
-}
-
-void
-ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op,
-                                raw_ostream &O) {
-  // (3 - the number of trailing zeros) is the number of then / else.
-  unsigned Mask = MI->getOperand(Op).getImm();
-  unsigned CondBit0 = Mask >> 4 & 1;
-  unsigned NumTZ = CountTrailingZeros_32(Mask);
-  assert(NumTZ <= 3 && "Invalid IT mask!");
-  for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
-    bool T = ((Mask >> Pos) & 1) == CondBit0;
-    if (T)
-      O << 't';
-    else
-      O << 'e';
-  }
-}
-
-void
-ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  O << "[" << getRegisterName(MO1.getReg());
-  O << ", " << getRegisterName(MO2.getReg()) << "]";
-}
-
-void
-ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
-                                            raw_ostream &O,
-                                            unsigned Scale) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  const MachineOperand &MO3 = MI->getOperand(Op+2);
-
-  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
-    printOperand(MI, Op, O);
-    return;
-  }
-
-  O << "[" << getRegisterName(MO1.getReg());
-  if (MO3.getReg())
-    O << ", " << getRegisterName(MO3.getReg());
-  else if (unsigned ImmOffs = MO2.getImm())
-    O << ", #" << ImmOffs * Scale;
-  O << "]";
-}
-
-void
-ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 1);
-}
-void
-ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 2);
-}
-void
-ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op,
-                                           raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 4);
-}
-
-void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op,
-                                                raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(Op);
-  const MachineOperand &MO2 = MI->getOperand(Op+1);
-  O << "[" << getRegisterName(MO1.getReg());
-  if (unsigned ImmOffs = MO2.getImm())
-    O << ", #" << ImmOffs*4;
-  O << "]";
-}
-
-//===--------------------------------------------------------------------===//
-
-// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
-// register with shift forms.
-// REG 0   0           - e.g. R5
-// REG IMM, SH_OPC     - e.g. R5, LSL #3
-void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum,
-                                     raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
-
-  unsigned Reg = MO1.getReg();
-  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
-  O << getRegisterName(Reg);
-
-  // Print the shift opc.
-  assert(MO2.isImm() && "Not a valid t2_so_reg value!");
-  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
-  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
-  if (ShOpc != ARM_AM::rrx)
-    O << " #" << ARM_AM::getSORegOffset(MO2.getImm());
-}
-
-void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI,
-                                                int OpNum,
-                                                raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  unsigned OffImm = MO2.getImm();
-  if (OffImm)  // Don't print +0.
-    O << ", #" << OffImm;
-  O << "]";
-}
-
-void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
-                                               int OpNum,
-                                               raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  int32_t OffImm = (int32_t)MO2.getImm();
-  // Don't print +0.
-  if (OffImm < 0)
-    O << ", #-" << -OffImm;
-  else if (OffImm > 0)
-    O << ", #" << OffImm;
-  O << "]";
-}
-
-void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI,
-                                                 int OpNum,
-                                                 raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  int32_t OffImm = (int32_t)MO2.getImm() / 4;
-  // Don't print +0.
-  if (OffImm < 0)
-    O << ", #-" << -OffImm * 4;
-  else if (OffImm > 0)
-    O << ", #" << OffImm * 4;
-  O << "]";
-}
-
-void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI,
-                                                     int OpNum,
-                                                     raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  int32_t OffImm = (int32_t)MO1.getImm();
-  // Don't print +0.
-  if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
-}
-
-void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI,
-                                                int OpNum,
-                                                raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
-  const MachineOperand &MO3 = MI->getOperand(OpNum+2);
-
-  O << "[" << getRegisterName(MO1.getReg());
-
-  assert(MO2.getReg() && "Invalid so_reg load / store address!");
-  O << ", " << getRegisterName(MO2.getReg());
-
-  unsigned ShAmt = MO3.getImm();
-  if (ShAmt) {
-    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
-    O << ", lsl #" << ShAmt;
-  }
-  O << "]";
-}
-
-
 //===--------------------------------------------------------------------===//
 
-void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int OpNum,
-                                          raw_ostream &O) {
-  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
-  if (CC != ARMCC::AL)
-    O << ARMCondCodeToString(CC);
-}
-
-void ARMAsmPrinter::printMandatoryPredicateOperand(const MachineInstr *MI,
-                                                   int OpNum,
-                                                   raw_ostream &O) {
-  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
-  O << ARMCondCodeToString(CC);
-}
-
-void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum,
-                                             raw_ostream &O){
-  unsigned Reg = MI->getOperand(OpNum).getReg();
-  if (Reg) {
-    assert(Reg == ARM::CPSR && "Expect ARM CPSR register!");
-    O << 's';
-  }
-}
-
-void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum,
-                                 raw_ostream &O) {
-  int Id = (int)MI->getOperand(OpNum).getImm();
-  O << MAI->getPrivateGlobalPrefix()
-    << "PC" << getFunctionNumber() << "_" << Id;
-}
-
-void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum,
-                                      raw_ostream &O) {
-  O << "{";
-  for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
-    if (MI->getOperand(i).isImplicit())
-      continue;
-    if ((int)i != OpNum) O << ", ";
-    printOperand(MI, i, O);
-  }
-  O << "}";
-}
-
-void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
-                                       raw_ostream &O, const char *Modifier) {
-  assert(Modifier && "This operand only works with a modifier!");
-  // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the
-  // data itself.
-  if (!strcmp(Modifier, "label")) {
-    unsigned ID = MI->getOperand(OpNum).getImm();
-    OutStreamer.EmitLabel(GetCPISymbol(ID));
-  } else {
-    assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
-    unsigned CPI = MI->getOperand(OpNum).getIndex();
-
-    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
-
-    if (MCPE.isMachineConstantPoolEntry()) {
-      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
-    } else {
-      EmitGlobalConstant(MCPE.Val.ConstVal);
-    }
-  }
-}
-
 MCSymbol *ARMAsmPrinter::
 GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
                             const MachineBasicBlock *MBB) const {
@@ -957,126 +266,12 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
   return OutContext.GetOrCreateSymbol(Name.str());
 }
 
-void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum,
-                                        raw_ostream &O) {
-  assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!");
-
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
-
-  unsigned JTI = MO1.getIndex();
-  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
-  // Can't use EmitLabel until instprinter happens, label comes out in the wrong
-  // order.
-  O << "\n" << *JTISymbol << ":\n";
-
-  const char *JTEntryDirective = MAI->getData32bitsDirective();
-
-  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
-  bool UseSet= MAI->hasSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
-  SmallPtrSet<MachineBasicBlock*, 8> JTSets;
-  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = JTBBs[i];
-    bool isNew = JTSets.insert(MBB);
-
-    if (UseSet && isNew) {
-      O << "\t.set\t"
-        << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB) << ','
-        << *MBB->getSymbol() << '-' << *JTISymbol << '\n';
-    }
-
-    O << JTEntryDirective << ' ';
-    if (UseSet)
-      O << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB);
-    else if (TM.getRelocationModel() == Reloc::PIC_)
-      O << *MBB->getSymbol() << '-' << *JTISymbol;
-    else
-      O << *MBB->getSymbol();
-
-    if (i != e-1)
-      O << '\n';
-  }
-}
-
-void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum,
-                                         raw_ostream &O) {
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
-  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
-  unsigned JTI = MO1.getIndex();
-
-  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
-
-  // Can't use EmitLabel until instprinter happens, label comes out in the wrong
-  // order.
-  O << "\n" << *JTISymbol << ":\n";
-
-  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
-  bool ByteOffset = false, HalfWordOffset = false;
-  if (MI->getOpcode() == ARM::t2TBB)
-    ByteOffset = true;
-  else if (MI->getOpcode() == ARM::t2TBH)
-    HalfWordOffset = true;
-
-  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = JTBBs[i];
-    if (ByteOffset)
-      O << MAI->getData8bitsDirective();
-    else if (HalfWordOffset)
-      O << MAI->getData16bitsDirective();
-
-    if (ByteOffset || HalfWordOffset)
-      O << '(' << *MBB->getSymbol() << "-" << *JTISymbol << ")/2";
-    else
-      O << "\tb.w " << *MBB->getSymbol();
-
-    if (i != e-1)
-      O << '\n';
-  }
-}
-
-void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum,
-                                    raw_ostream &O) {
-  O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg());
-  if (MI->getOpcode() == ARM::t2TBH)
-    O << ", lsl #1";
-  O << ']';
-}
 
-void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum,
-                                         raw_ostream &O) {
-  O << MI->getOperand(OpNum).getImm();
-}
-
-void ARMAsmPrinter::printVFPf32ImmOperand(const MachineInstr *MI, int OpNum,
-                                          raw_ostream &O) {
-  const ConstantFP *FP = MI->getOperand(OpNum).getFPImm();
-  O << '#' << FP->getValueAPF().convertToFloat();
-  if (isVerbose()) {
-    O << "\t\t" << MAI->getCommentString() << ' ';
-    WriteAsOperand(O, FP, /*PrintType=*/false);
-  }
-}
-
-void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum,
-                                          raw_ostream &O) {
-  const ConstantFP *FP = MI->getOperand(OpNum).getFPImm();
-  O << '#' << FP->getValueAPF().convertToDouble();
-  if (isVerbose()) {
-    O << "\t\t" << MAI->getCommentString() << ' ';
-    WriteAsOperand(O, FP, /*PrintType=*/false);
-  }
-}
-
-void ARMAsmPrinter::printNEONModImmOperand(const MachineInstr *MI, int OpNum,
-                                           raw_ostream &O) {
-  unsigned EncodedImm = MI->getOperand(OpNum).getImm();
-  unsigned EltBits;
-  uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
-  O << "#0x" << utohexstr(Val);
+MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel(void) const {
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "SJLJEH"
+    << getFunctionNumber();
+  return OutContext.GetOrCreateSymbol(Name.str());
 }
 
 bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
@@ -1090,14 +285,16 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     default: return true;  // Unknown modifier.
     case 'a': // Print as a memory address.
       if (MI->getOperand(OpNum).isReg()) {
-        O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]";
+        O << "["
+          << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
+          << "]";
         return false;
       }
       // Fallthrough
     case 'c': // Don't print "#" before an immediate operand.
       if (!MI->getOperand(OpNum).isImm())
         return true;
-      printNoHashImmediate(MI, OpNum, O);
+      O << MI->getOperand(OpNum).getImm();
       return false;
     case 'P': // Print a VFP double precision register.
     case 'q': // Print a NEON quad precision register.
@@ -1106,7 +303,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     case 'Q':
     case 'R':
     case 'H':
-      report_fatal_error("llvm does not support 'Q', 'R', and 'H' modifiers!");
+      // These modifiers are not yet supported.
       return true;
     }
   }
@@ -1124,48 +321,10 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 
   const MachineOperand &MO = MI->getOperand(OpNum);
   assert(MO.isReg() && "unexpected inline asm memory operand");
-  O << "[" << getRegisterName(MO.getReg()) << "]";
+  O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
   return false;
 }
 
-void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  if (EnableMCInst) {
-    printInstructionThroughMCStreamer(MI);
-    return;
-  }
-
-  if (MI->getOpcode() == ARM::CONSTPOOL_ENTRY)
-    EmitAlignment(2);
-
-  SmallString<128> Str;
-  raw_svector_ostream OS(Str);
-  if (MI->getOpcode() == ARM::DBG_VALUE) {
-    unsigned NOps = MI->getNumOperands();
-    assert(NOps==4);
-    OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-    // cast away const; DIetc do not take const operands for some reason.
-    DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-    OS << V.getName();
-    OS << " <- ";
-    // Frame address.  Currently handles register +- offset only.
-    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-    OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
-    OS << ']';
-    OS << "+";
-    printOperand(MI, NOps-2, OS);
-    OutStreamer.EmitRawText(OS.str());
-    return;
-  }
-
-  printInstruction(MI, OS);
-  OutStreamer.EmitRawText(OS.str());
-
-  // Make sure the instruction that follows TBB is 2-byte aligned.
-  // FIXME: Constant island pass should insert an "ALIGN" instruction instead.
-  if (MI->getOpcode() == ARM::t2TBB)
-    EmitAlignment(1);
-}
-
 void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
   if (Subtarget->isTargetDarwin()) {
     Reloc::Model RelocM = TM.getRelocationModel();
@@ -1205,49 +364,12 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
   }
 
   // Use unified assembler syntax.
-  OutStreamer.EmitRawText(StringRef("\t.syntax unified"));
+  OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified);
 
   // Emit ARM Build Attributes
   if (Subtarget->isTargetELF()) {
-    // CPU Type
-    std::string CPUString = Subtarget->getCPUString();
-    if (CPUString != "generic")
-      OutStreamer.EmitRawText("\t.cpu " + Twine(CPUString));
-
-    // FIXME: Emit FPU type
-    if (Subtarget->hasVFP2())
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::VFP_arch) + ", 2");
-
-    // Signal various FP modes.
-    if (!UnsafeFPMath) {
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_FP_denormal) + ", 1");
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_FP_exceptions) + ", 1");
-    }
 
-    if (NoInfsFPMath && NoNaNsFPMath)
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 1");
-    else
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 3");
-
-    // 8-bytes alignment stuff.
-    OutStreamer.EmitRawText("\t.eabi_attribute " +
-                            Twine(ARMBuildAttrs::ABI_align8_needed) + ", 1");
-    OutStreamer.EmitRawText("\t.eabi_attribute " +
-                            Twine(ARMBuildAttrs::ABI_align8_preserved) + ", 1");
-
-    // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
-    if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) {
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_HardFP_use) + ", 3");
-      OutStreamer.EmitRawText("\t.eabi_attribute " +
-                              Twine(ARMBuildAttrs::ABI_VFP_args) + ", 1");
-    }
-    // FIXME: Should we signal R9 usage?
+    emitAttributes();
   }
 }
 
@@ -1280,10 +402,10 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
         else
           // Internal to current translation unit.
           //
-          // When we place the LSDA into the TEXT section, the type info pointers
-          // need to be indirect and pc-rel. We accomplish this by using NLPs.
-          // However, sometimes the types are local to the file. So we need to
-          // fill in the value for the NLP in those cases.
+          // When we place the LSDA into the TEXT section, the type info
+          // pointers need to be indirect and pc-rel. We accomplish this by
+          // using NLPs; however, sometimes the types are local to the file.
+          // We need to fill in the value for the NLP in those cases.
           OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
                                                         OutContext),
                                 4/*size*/, 0/*addrspace*/);
@@ -1321,38 +443,631 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
 }
 
 //===----------------------------------------------------------------------===//
+// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// FIXME:
+// The following seem like one-off assembler flags, but they actually need
+// to appear in the .ARM.attributes section in ELF.
+// Instead of subclassing the MCELFStreamer, we do the work here.
+
+void ARMAsmPrinter::emitAttributes() {
+
+  emitARMAttributeSection();
+
+  AttributeEmitter *AttrEmitter;
+  if (OutStreamer.hasRawTextSupport())
+    AttrEmitter = new AsmAttributeEmitter(OutStreamer);
+  else {
+    MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer);
+    AttrEmitter = new ObjectAttributeEmitter(O);
+  }
+
+  AttrEmitter->MaybeSwitchVendor("aeabi");
+
+  std::string CPUString = Subtarget->getCPUString();
+
+  if (CPUString == "cortex-a8" ||
+      Subtarget->isCortexA8()) {
+    AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8");
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                               ARMBuildAttrs::ApplicationProfile);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::AllowThumb32);
+    // Fixme: figure out when this is emitted.
+    //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch,
+    //                           ARMBuildAttrs::AllowWMMXv1);
+    //
+
+    /// ADD additional Else-cases here!
+  } else if (CPUString == "generic") {
+    // FIXME: Why these defaults?
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::Allowed);
+  }
 
-void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
-  ARMMCInstLower MCInstLowering(OutContext, *Mang, *this);
-  switch (MI->getOpcode()) {
-  case ARM::t2MOVi32imm:
-    assert(0 && "Should be lowered by thumb2it pass");
+  // FIXME: Emit FPU type
+  if (Subtarget->hasVFP2())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
+                               ARMBuildAttrs::AllowFPv2);
+
+  // Signal various FP modes.
+  if (!UnsafeFPMath) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                               ARMBuildAttrs::Allowed);
+  }
+
+  if (NoInfsFPMath && NoNaNsFPMath)
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                               ARMBuildAttrs::Allowed);
+  else
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                               ARMBuildAttrs::AllowIEE754);
+
+  // FIXME: add more flags to ARMBuildAttrs.h
+  // 8-bytes alignment stuff.
+  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
+  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+
+  // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+  if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
+  }
+  // FIXME: Should we signal R9 usage?
+
+  if (Subtarget->hasDivide())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1);
+
+  AttrEmitter->Finish();
+  delete AttrEmitter;
+}
+
+void ARMAsmPrinter::emitARMAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (OutStreamer.hasRawTextSupport())
+    return;
+
+  const ARMElfTargetObjectFile &TLOFELF =
+    static_cast<const ARMElfTargetObjectFile &>
+    (getObjFileLowering());
+
+  OutStreamer.SwitchSection(TLOFELF.getAttributesSection());
+
+  // Format version
+  OutStreamer.EmitIntValue(0x41, 1);
+}
+
+//===----------------------------------------------------------------------===//
+
+static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
+                             unsigned LabelId, MCContext &Ctx) {
+
+  MCSymbol *Label = Ctx.GetOrCreateSymbol(Twine(Prefix)
+                       + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId));
+  return Label;
+}
+
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
+  switch (Modifier) {
+  default: llvm_unreachable("Unknown modifier!");
+  case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
+  case ARMCP::TLSGD:       return MCSymbolRefExpr::VK_ARM_TLSGD;
+  case ARMCP::TPOFF:       return MCSymbolRefExpr::VK_ARM_TPOFF;
+  case ARMCP::GOTTPOFF:    return MCSymbolRefExpr::VK_ARM_GOTTPOFF;
+  case ARMCP::GOT:         return MCSymbolRefExpr::VK_ARM_GOT;
+  case ARMCP::GOTOFF:      return MCSymbolRefExpr::VK_ARM_GOTOFF;
+  }
+  return MCSymbolRefExpr::VK_None;
+}
+
+MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
+  bool isIndirect = Subtarget->isTargetDarwin() &&
+    Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+  if (!isIndirect)
+    return Mang->getSymbol(GV);
+
+  // FIXME: Remove this when Darwin transition to @GOT like syntax.
+  MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+  MachineModuleInfoMachO &MMIMachO =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  MachineModuleInfoImpl::StubValueTy &StubSym =
+    GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) :
+    MMIMachO.getGVStubEntry(MCSym);
+  if (StubSym.getPointer() == 0)
+    StubSym = MachineModuleInfoImpl::
+      StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+  return MCSym;
+}
+
+void ARMAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  int Size = TM.getTargetData()->getTypeAllocSize(MCPV->getType());
+
+  ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+
+  MCSymbol *MCSym;
+  if (ACPV->isLSDA()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+    OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
+    MCSym = OutContext.GetOrCreateSymbol(OS.str());
+  } else if (ACPV->isBlockAddress()) {
+    MCSym = GetBlockAddressSymbol(ACPV->getBlockAddress());
+  } else if (ACPV->isGlobalValue()) {
+    const GlobalValue *GV = ACPV->getGV();
+    MCSym = GetARMGVSymbol(GV);
+  } else {
+    assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+    MCSym = GetExternalSymbolSymbol(ACPV->getSymbol());
+  }
+
+  // Create an MCSymbol for the reference.
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+                            OutContext);
+
+  if (ACPV->getPCAdjustment()) {
+    MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                    getFunctionNumber(),
+                                    ACPV->getLabelId(),
+                                    OutContext);
+    const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext);
+    PCRelExpr =
+      MCBinaryExpr::CreateAdd(PCRelExpr,
+                              MCConstantExpr::Create(ACPV->getPCAdjustment(),
+                                                     OutContext),
+                              OutContext);
+    if (ACPV->mustAddCurrentAddress()) {
+      // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+      // label, so just emit a local label end reference that instead.
+      MCSymbol *DotSym = OutContext.CreateTempSymbol();
+      OutStreamer.EmitLabel(DotSym);
+      const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+      PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext);
+    }
+    Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext);
+  }
+  OutStreamer.EmitValue(Expr, Size);
+}
+
+void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = 1;
+  if (Opcode == ARM::BR_JTadd)
+    OpNum = 2;
+  else if (Opcode == ARM::BR_JTm)
+    OpNum = 3;
+
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  // Emit a label for the jump table.
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    // Construct an MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr)
+    //
+    // For example, a table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .word (LBB0 - LJTI_0_0)
+    //    .word (LBB1 - LJTI_0_0)
+    const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+
+    if (TM.getRelocationModel() == Reloc::PIC_)
+      Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
+                                                                   OutContext),
+                                     OutContext);
+    OutStreamer.EmitValue(Expr, 4);
+  }
+}
+
+void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1;
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  // Emit a label for the jump table.
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  unsigned OffsetWidth = 4;
+  if (MI->getOpcode() == ARM::t2TBB_JT)
+    OffsetWidth = 1;
+  else if (MI->getOpcode() == ARM::t2TBH_JT)
+    OffsetWidth = 2;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                                      OutContext);
+    // If this isn't a TBB or TBH, the entries are direct branch instructions.
+    if (OffsetWidth == 4) {
+      MCInst BrInst;
+      BrInst.setOpcode(ARM::t2B);
+      BrInst.addOperand(MCOperand::CreateExpr(MBBSymbolExpr));
+      OutStreamer.EmitInstruction(BrInst);
+      continue;
+    }
+    // Otherwise it's an offset from the dispatch instruction. Construct an
+    // MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr) / 2
+    //
+    // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .byte (LBB0 - LJTI_0_0) / 2
+    //    .byte (LBB1 - LJTI_0_0) / 2
+    const MCExpr *Expr =
+      MCBinaryExpr::CreateSub(MBBSymbolExpr,
+                              MCSymbolRefExpr::Create(JTISymbol, OutContext),
+                              OutContext);
+    Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext),
+                                   OutContext);
+    OutStreamer.EmitValue(Expr, OffsetWidth);
+  }
+}
+
+void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps==4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps-2, OS);
+}
+
+static void populateADROperands(MCInst &Inst, unsigned Dest,
+                                const MCSymbol *Label,
+                                unsigned pred, unsigned ccreg,
+                                MCContext &Ctx) {
+  const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, Ctx);
+  Inst.addOperand(MCOperand::CreateReg(Dest));
+  Inst.addOperand(MCOperand::CreateExpr(SymbolExpr));
+  // Add predicate operands.
+  Inst.addOperand(MCOperand::CreateImm(pred));
+  Inst.addOperand(MCOperand::CreateReg(ccreg));
+}
+
+void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI,
+                                           unsigned Opcode) {
+  MCInst TmpInst;
+
+  // Emit the instruction as usual, just patch the opcode.
+  LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
+  TmpInst.setOpcode(Opcode);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
   default: break;
-  case ARM::PICADD: { // FIXME: Remove asm string from td file.
+  case ARM::t2ADDrSPi:
+  case ARM::t2ADDrSPi12:
+  case ARM::t2SUBrSPi:
+  case ARM::t2SUBrSPi12:
+    assert ((MI->getOperand(1).getReg() == ARM::SP) &&
+            "Unexpected source register!");
+    break;
+
+  case ARM::t2MOVi32imm: assert(0 && "Should be lowered by thumb2it pass");
+  case ARM::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+  case ARM::tBfar: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::tBL);
+    TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+          MI->getOperand(0).getMBB()->getSymbol(), OutContext)));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::LEApcrel:
+  case ARM::tLEApcrel:
+  case ARM::t2LEApcrel: {
+    // FIXME: Need to also handle globals and externals
+    MCInst TmpInst;
+    TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrel ? ARM::t2ADR
+                      : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
+                         : ARM::ADR));
+    populateADROperands(TmpInst, MI->getOperand(0).getReg(),
+                        GetCPISymbol(MI->getOperand(1).getIndex()),
+                        MI->getOperand(2).getImm(), MI->getOperand(3).getReg(),
+                        OutContext);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::LEApcrelJT:
+  case ARM::tLEApcrelJT:
+  case ARM::t2LEApcrelJT: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrelJT ? ARM::t2ADR
+                      : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
+                         : ARM::ADR));
+    populateADROperands(TmpInst, MI->getOperand(0).getReg(),
+                      GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(),
+                                                  MI->getOperand(2).getImm()),
+                      MI->getOperand(3).getImm(), MI->getOperand(4).getReg(),
+                      OutContext);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::MOVPCRX: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVr);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::BXr9_CALL:
+  case ARM::BX_CALL: {
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      // Add predicate operands.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // Add 's' bit operand (always reg0 for this)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::BX);
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return;
+  }
+  case ARM::BMOVPCRXr9_CALL:
+  case ARM::BMOVPCRX_CALL: {
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      // Add predicate operands.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // Add 's' bit operand (always reg0 for this)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      // Add predicate operands.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // Add 's' bit operand (always reg0 for this)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return;
+  }
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+
+    unsigned TF = MI->getOperand(1).getTargetFlags();
+    bool isPIC = TF == ARMII::MO_LO16_NONLAZY_PIC;
+    const GlobalValue *GV = MI->getOperand(1).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    if (isPIC) {
+      MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(2).getImm(), OutContext);
+      const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+      unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+      const MCExpr *PCRelExpr =
+        ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                  MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+    } else {
+      const MCExpr *RefExpr= ARMMCExpr::CreateLower16(GVSymExpr, OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
+    }
+
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel
+                      ? ARM::MOVTi16 : ARM::t2MOVTi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+
+    unsigned TF = MI->getOperand(2).getTargetFlags();
+    bool isPIC = TF == ARMII::MO_HI16_NONLAZY_PIC;
+    const GlobalValue *GV = MI->getOperand(2).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    if (isPIC) {
+      MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(3).getImm(), OutContext);
+      const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+      unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+      const MCExpr *PCRelExpr =
+        ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                   MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+    } else {
+      const MCExpr *RefExpr= ARMMCExpr::CreateUpper16(GVSymExpr, OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::tPICADD: {
     // This is a pseudo op for a label + instruction sequence, which looks like:
     // LPC0:
-    //     add r0, pc, r0
+    //     add r0, pc
     // This adds the address of LPC0 to r0.
 
     // Emit the label.
-    // FIXME: MOVE TO SHARED PLACE.
-    unsigned Id = (unsigned)MI->getOperand(2).getImm();
-    const char *Prefix = MAI->getPrivateGlobalPrefix();
-    MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix)
-                         + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id));
-    OutStreamer.EmitLabel(Label);
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
 
+    // Form and emit the add.
+    MCInst AddInst;
+    AddInst.setOpcode(ARM::tADDhirr);
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    AddInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    // Add predicate operands.
+    AddInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    AddInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(AddInst);
+    return;
+  }
+  case ARM::PICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
 
-    // Form and emit tha dd.
+    // Form and emit the add.
     MCInst AddInst;
     AddInst.setOpcode(ARM::ADDrr);
     AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
     AddInst.addOperand(MCOperand::CreateReg(ARM::PC));
     AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    // Add predicate operands.
+    AddInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg()));
+    // Add 's' bit operand (always reg0 for this)
+    AddInst.addOperand(MCOperand::CreateReg(0));
     OutStreamer.EmitInstruction(AddInst);
     return;
   }
-  case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file.
+  case ARM::PICSTR:
+  case ARM::PICSTRB:
+  case ARM::PICSTRH:
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICLDRH:
+  case ARM::PICLDRSB:
+  case ARM::PICLDRSH: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     OP r0, [pc, r0]
+    // The LCP0 label is referenced by a constant pool entry in order to get
+    // a PC-relative address at the ldr instruction.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the load
+    unsigned Opcode;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    case ARM::PICSTR:   Opcode = ARM::STRrs; break;
+    case ARM::PICSTRB:  Opcode = ARM::STRBrs; break;
+    case ARM::PICSTRH:  Opcode = ARM::STRH; break;
+    case ARM::PICLDR:   Opcode = ARM::LDRrs; break;
+    case ARM::PICLDRB:  Opcode = ARM::LDRBrs; break;
+    case ARM::PICLDRH:  Opcode = ARM::LDRH; break;
+    case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
+    case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
+    }
+    MCInst LdStInst;
+    LdStInst.setOpcode(Opcode);
+    LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    LdStInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    LdStInst.addOperand(MCOperand::CreateImm(0));
+    // Add predicate operands.
+    LdStInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
+    LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg()));
+    OutStreamer.EmitInstruction(LdStInst);
+
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: {
     /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
     /// in the function.  The first operand is the ID# for this instruction, the
     /// second is the index into the MachineConstantPool that this is, the third
@@ -1371,100 +1086,450 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
 
     return;
   }
-  case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file.
-    // This is a hack that lowers as a two instruction sequence.
-    unsigned DstReg = MI->getOperand(0).getReg();
-    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+  case ARM::t2BR_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::tMOVgpr2gpr);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::t2TBB_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    MCInst TmpInst;
+
+    TmpInst.setOpcode(ARM::t2TBB);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    // Make sure the next instruction is 2-byte aligned.
+    EmitAlignment(1);
+    return;
+  }
+  case ARM::t2TBH_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    MCInst TmpInst;
+
+    TmpInst.setOpcode(ARM::t2TBH);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::tBR_JTr:
+  case ARM::BR_JTr: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // mov pc, target
+    MCInst TmpInst;
+    unsigned Opc = MI->getOpcode() == ARM::BR_JTr ?
+      ARM::MOVr : ARM::tMOVgpr2gpr;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    if (Opc == ARM::MOVr)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+
+    // Make sure the Thumb jump table is 4-byte aligned.
+    if (Opc == ARM::tMOVgpr2gpr)
+      EmitAlignment(2);
 
-    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
-    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTm: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // ldr pc, target
+    MCInst TmpInst;
+    if (MI->getOperand(1).getReg() == 0) {
+      // literal offset
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+    } else {
+      TmpInst.setOpcode(ARM::LDRrs);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
 
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTadd: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // add pc, target, idx
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::ADDrr);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::TRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetDarwin()) {
+      //.long 0xe7ffdefe @ trap
+      uint32_t Val = 0xe7ffdefeUL;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 4);
+      return;
+    }
+    break;
+  }
+  case ARM::tTRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetDarwin()) {
+      //.short 57086 @ trap
+      uint16_t Val = 0xdefe;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 2);
+      return;
+    }
+    break;
+  }
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // mov $val, pc
+    // adds $val, #7
+    // str $val, [$src, #4]
+    // movs r0, #0
+    // b 1f
+    // movs r0, #1
+    // 1:
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+    MCSymbol *Label = GetARMSJLJEHLabel();
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::MOVi);
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));
-      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1));
-
+      TmpInst.setOpcode(ARM::tMOVgpr2tgpr);
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      // 's' bit operand
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+      OutStreamer.AddComment("eh_setjmp begin");
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tADDi3);
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      // 's' bit operand
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      TmpInst.addOperand(MCOperand::CreateImm(7));
       // Predicate.
-      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
-      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
-
-      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
       OutStreamer.EmitInstruction(TmpInst);
     }
-
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::ORRri);
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // dstreg
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // inreg
-      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm
+      TmpInst.setOpcode(ARM::tSTRi);
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      // The offset immediate is #4. The operand value is scaled by 4 for the
+      // tSTR instruction.
+      TmpInst.addOperand(MCOperand::CreateImm(1));
       // Predicate.
-      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
-      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R0));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext);
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tB);
+      TmpInst.addOperand(MCOperand::CreateExpr(SymbolExpr));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R0));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+      TmpInst.addOperand(MCOperand::CreateImm(1));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.AddComment("eh_setjmp end");
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    OutStreamer.EmitLabel(Label);
+    return;
+  }
 
-      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::Int_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // add $val, pc, #8
+    // str $val, [$src, #+4]
+    // mov r0, #0
+    // add pc, pc, #0
+    // mov r0, #1
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::ADDri);
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateImm(8));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // 's' bit operand (always reg0 for this).
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.AddComment("eh_setjmp begin");
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::STRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ValReg));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateImm(4));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R0));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // 's' bit operand (always reg0 for this).
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::ADDri);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // 's' bit operand (always reg0 for this).
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R0));
+      TmpInst.addOperand(MCOperand::CreateImm(1));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // 's' bit operand (always reg0 for this).
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.AddComment("eh_setjmp end");
       OutStreamer.EmitInstruction(TmpInst);
     }
     return;
   }
-  case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file.
-    // This is a hack that lowers as a two instruction sequence.
-    unsigned DstReg = MI->getOperand(0).getReg();
-    const MachineOperand &MO = MI->getOperand(1);
-    MCOperand V1, V2;
-    if (MO.isImm()) {
-      unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
-      V1 = MCOperand::CreateImm(ImmVal & 65535);
-      V2 = MCOperand::CreateImm(ImmVal >> 16);
-    } else if (MO.isGlobal()) {
-      MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO);
-      const MCSymbolRefExpr *SymRef1 =
-        MCSymbolRefExpr::Create(Symbol,
-                                MCSymbolRefExpr::VK_ARM_LO16, OutContext);
-      const MCSymbolRefExpr *SymRef2 =
-        MCSymbolRefExpr::Create(Symbol,
-                                MCSymbolRefExpr::VK_ARM_HI16, OutContext);
-      V1 = MCOperand::CreateExpr(SymRef1);
-      V2 = MCOperand::CreateExpr(SymRef2);
-    } else {
-      MI->dump();
-      llvm_unreachable("cannot handle this operand");
+  case ARM::Int_eh_sjlj_longjmp: {
+    // ldr sp, [$src, #8]
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::SP));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateImm(8));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
     }
-
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::MOVi16);
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
-      TmpInst.addOperand(V1); // lower16(imm)
-
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateImm(4));
       // Predicate.
-      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
-      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
-
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
       OutStreamer.EmitInstruction(TmpInst);
     }
-
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::MOVTi16);
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
-      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // srcreg
-      TmpInst.addOperand(V2);   // upper16(imm)
-
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R7));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
       // Predicate.
-      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
-      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
-
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::BX);
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
       OutStreamer.EmitInstruction(TmpInst);
     }
-
     return;
   }
+  case ARM::tInt_eh_sjlj_longjmp: {
+    // ldr $scratch, [$src, #8]
+    // mov sp, $scratch
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tLDRi);
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      // The offset immediate is #8. The operand value is scaled by 4 for the
+      // tLDR instruction.
+      TmpInst.addOperand(MCOperand::CreateImm(2));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVtgpr2gpr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::SP));
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tLDRi);
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateImm(1));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tLDRr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::R7));
+      TmpInst.addOperand(MCOperand::CreateReg(SrcReg));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tBX_RET_vararg);
+      TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return;
+  }
+  // These are the pseudos created to comply with stricter operand restrictions
+  // on ARMv5. Lower them now to "normal" instructions, since all the
+  // restrictions are already satisfied.
+  case ARM::MULv5:
+    EmitPatchedInstruction(MI, ARM::MUL);
+    return;
+  case ARM::MLAv5:
+    EmitPatchedInstruction(MI, ARM::MLA);
+    return;
+  case ARM::SMULLv5:
+    EmitPatchedInstruction(MI, ARM::SMULL);
+    return;
+  case ARM::UMULLv5:
+    EmitPatchedInstruction(MI, ARM::UMULL);
+    return;
+  case ARM::SMLALv5:
+    EmitPatchedInstruction(MI, ARM::SMLAL);
+    return;
+  case ARM::UMLALv5:
+    EmitPatchedInstruction(MI, ARM::UMLAL);
+    return;
+  case ARM::UMAALv5:
+    EmitPatchedInstruction(MI, ARM::UMAAL);
+    return;
   }
 
   MCInst TmpInst;
-  MCInstLowering.Lower(MI, TmpInst);
+  LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
   OutStreamer.EmitInstruction(TmpInst);
 }
 
@@ -1476,7 +1541,7 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T,
                                              unsigned SyntaxVariant,
                                              const MCAsmInfo &MAI) {
   if (SyntaxVariant == 0)
-    return new ARMInstPrinter(MAI, false);
+    return new ARMInstPrinter(MAI);
   return 0;
 }
 
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
new file mode 100644
index 0000000..5852684
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -0,0 +1,112 @@
+//===-- ARMAsmPrinter.h - Print machine code to an ARM .s file ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ARM Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMASMPRINTER_H
+#define ARMASMPRINTER_H
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+namespace ARM {
+  enum DW_ISA {
+    DW_ISA_ARM_thumb = 1,
+    DW_ISA_ARM_arm = 2
+  };
+}
+
+class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const ARMSubtarget *Subtarget;
+
+  /// AFI - Keep a pointer to ARMFunctionInfo for the current
+  /// MachineFunction.
+  ARMFunctionInfo *AFI;
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+public:
+  explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    }
+
+  virtual const char *getPassName() const {
+    return "ARM Assembly Printer";
+  }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = 0);
+
+  virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O);
+  virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                                     unsigned AsmVariant,
+                                     const char *ExtraCode, raw_ostream &O);
+
+  void EmitJumpTable(const MachineInstr *MI);
+  void EmitJump2Table(const MachineInstr *MI);
+  virtual void EmitInstruction(const MachineInstr *MI);
+  bool runOnMachineFunction(MachineFunction &F);
+
+  virtual void EmitConstantPool() {} // we emit constant pools customly!
+  virtual void EmitFunctionEntryLabel();
+  void EmitStartOfAsmFile(Module &M);
+  void EmitEndOfAsmFile(Module &M);
+
+private:
+  // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+  void emitAttributes();
+
+  // Helper for ELF .o only
+  void emitARMAttributeSection();
+
+  // Generic helper used to emit e.g. ARMv5 mul pseudos
+  void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
+
+public:
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+
+  virtual unsigned getISAEncoding() {
+    // ARM/Darwin adds ISA to the DWARF info for each function.
+    if (!Subtarget->isTargetDarwin())
+      return 0;
+    return Subtarget->isThumb() ?
+      llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm;
+  }
+
+  MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
+                                        const MachineBasicBlock *MBB) const;
+  MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
+
+  MCSymbol *GetARMSJLJEHLabel(void) const;
+
+  MCSymbol *GetARMGVSymbol(const GlobalValue *GV);
+  
+  /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+  /// the .s file.
+  virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h
new file mode 100644
index 0000000..a56cc1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInfo.h
@@ -0,0 +1,249 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINFO_H
+#define ARMBASEINFO_H
+
+#include "llvm/Support/ErrorHandling.h"
+
+// Note that the following auto-generated files only defined enum types, and
+// so are safe to include here.
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#include "ARMGenRegisterNames.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#include "ARMGenInstrNames.inc"
+
+namespace llvm {
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
+    EQ,            // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Carry set                  >, ==, or unordered
+    LO,            // Carry clear                Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Not unordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Greater than               Greater than
+    LE,            // Less than or equal         <, ==, or unordered
+    AL             // Always (unconditional)     Always (unconditional)
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+} // namespace ARMCC
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code");
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+}
+
+namespace ARM_PROC {
+  enum IMod {
+    IE = 2,
+    ID = 3
+  };
+
+  enum IFlags {
+    F = 1,
+    I = 2,
+    A = 4
+  };
+
+  inline static const char *IFlagsToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown iflags operand");
+    case F: return "f";
+    case I: return "i";
+    case A: return "a";
+    }
+  }
+
+  inline static const char *IModToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown imod operand");
+    case IE: return "ie";
+    case ID: return "id";
+    }
+  }
+}
+
+namespace ARM_MB {
+  // The Memory Barrier Option constants map directly to the 4-bit encoding of
+  // the option field for memory barrier operations.
+  enum MemBOpt {
+    SY    = 15,
+    ST    = 14,
+    ISH   = 11,
+    ISHST = 10,
+    NSH   = 7,
+    NSHST = 6,
+    OSH   = 3,
+    OSHST = 2
+  };
+
+  inline static const char *MemBOptToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown memory operation");
+    case SY:    return "sy";
+    case ST:    return "st";
+    case ISH:   return "ish";
+    case ISHST: return "ishst";
+    case NSH:   return "nsh";
+    case NSHST: return "nshst";
+    case OSH:   return "osh";
+    case OSHST: return "oshst";
+    }
+  }
+} // namespace ARM_MB
+
+/// getARMRegisterNumbering - Given the enum value for some register, e.g.
+/// ARM::LR, return the number that it corresponds to (e.g. 14).
+inline static unsigned getARMRegisterNumbering(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  default:
+    llvm_unreachable("Unknown ARM register!");
+  case R0:  case S0:  case D0:  case Q0:  return 0;
+  case R1:  case S1:  case D1:  case Q1:  return 1;
+  case R2:  case S2:  case D2:  case Q2:  return 2;
+  case R3:  case S3:  case D3:  case Q3:  return 3;
+  case R4:  case S4:  case D4:  case Q4:  return 4;
+  case R5:  case S5:  case D5:  case Q5:  return 5;
+  case R6:  case S6:  case D6:  case Q6:  return 6;
+  case R7:  case S7:  case D7:  case Q7:  return 7;
+  case R8:  case S8:  case D8:  case Q8:  return 8;
+  case R9:  case S9:  case D9:  case Q9:  return 9;
+  case R10: case S10: case D10: case Q10: return 10;
+  case R11: case S11: case D11: case Q11: return 11;
+  case R12: case S12: case D12: case Q12: return 12;
+  case SP:  case S13: case D13: case Q13: return 13;
+  case LR:  case S14: case D14: case Q14: return 14;
+  case PC:  case S15: case D15: case Q15: return 15;
+
+  case S16: case D16: return 16;
+  case S17: case D17: return 17;
+  case S18: case D18: return 18;
+  case S19: case D19: return 19;
+  case S20: case D20: return 20;
+  case S21: case D21: return 21;
+  case S22: case D22: return 22;
+  case S23: case D23: return 23;
+  case S24: case D24: return 24;
+  case S25: case D25: return 25;
+  case S26: case D26: return 26;
+  case S27: case D27: return 27;
+  case S28: case D28: return 28;
+  case S29: case D29: return 29;
+  case S30: case D30: return 30;
+  case S31: case D31: return 31;
+  }
+}
+
+namespace ARMII {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16,
+
+    /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY,
+
+    /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction.
+    MO_HI16_NONLAZY,
+
+    /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY_PIC,
+
+    /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movt instruction.
+    MO_HI16_NONLAZY_PIC,
+
+    /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
+    /// call operand.
+    MO_PLT
+  };
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e4f10f9..2268e59 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -15,13 +15,13 @@
 #include "ARM.h"
 #include "ARMAddressingModes.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMHazardRecognizer.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMRegisterInfo.h"
 #include "ARMGenInstrInfo.inc"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/GlobalValue.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -34,15 +34,75 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/STLExtras.h"
 using namespace llvm;
 
 static cl::opt<bool>
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                cl::desc("Enable ARM 2-addr to 3-addr conv"));
 
+/// ARM_MLxEntry - Record information about MLA / MLS instructions.
+struct ARM_MLxEntry {
+  unsigned MLxOpc;     // MLA / MLS opcode
+  unsigned MulOpc;     // Expanded multiplication opcode
+  unsigned AddSubOpc;  // Expanded add / sub opcode
+  bool NegAcc;         // True if the acc is negated before the add / sub.
+  bool HasLane;        // True if instruction has an extra "lane" operand.
+};
+
+static const ARM_MLxEntry ARM_MLxTable[] = {
+  // MLxOpc,          MulOpc,           AddSubOpc,       NegAcc, HasLane
+  // fp scalar ops
+  { ARM::VMLAS,       ARM::VMULS,       ARM::VADDS,      false,  false },
+  { ARM::VMLSS,       ARM::VMULS,       ARM::VSUBS,      false,  false },
+  { ARM::VMLAD,       ARM::VMULD,       ARM::VADDD,      false,  false },
+  { ARM::VMLSD,       ARM::VMULD,       ARM::VSUBD,      false,  false },
+  { ARM::VNMLAS,      ARM::VNMULS,      ARM::VSUBS,      true,   false },
+  { ARM::VNMLSS,      ARM::VMULS,       ARM::VSUBS,      true,   false },
+  { ARM::VNMLAD,      ARM::VNMULD,      ARM::VSUBD,      true,   false },
+  { ARM::VNMLSD,      ARM::VMULD,       ARM::VSUBD,      true,   false },
+
+  // fp SIMD ops
+  { ARM::VMLAfd,      ARM::VMULfd,      ARM::VADDfd,     false,  false },
+  { ARM::VMLSfd,      ARM::VMULfd,      ARM::VSUBfd,     false,  false },
+  { ARM::VMLAfq,      ARM::VMULfq,      ARM::VADDfq,     false,  false },
+  { ARM::VMLSfq,      ARM::VMULfq,      ARM::VSUBfq,     false,  false },
+  { ARM::VMLAslfd,    ARM::VMULslfd,    ARM::VADDfd,     false,  true  },
+  { ARM::VMLSslfd,    ARM::VMULslfd,    ARM::VSUBfd,     false,  true  },
+  { ARM::VMLAslfq,    ARM::VMULslfq,    ARM::VADDfq,     false,  true  },
+  { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
+};
+
 ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
   : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
     Subtarget(STI) {
+  for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
+    if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
+      assert(false && "Duplicated entries?");
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
+  }
+}
+
+// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl
+// currently defaults to no prepass hazard recognizer.
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetHazardRecognizer(const TargetMachine *TM,
+                             const ScheduleDAG *DAG) const {
+  if (usePreRAHazardRecognizer()) {
+    const InstrItineraryData *II = TM->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
+  }
+  return TargetInstrInfoImpl::CreateTargetHazardRecognizer(TM, DAG);
+}
+
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  if (Subtarget.isThumb2() || Subtarget.hasVFP2())
+    return (ScheduleHazardRecognizer *)
+      new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG);
+  return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II, DAG);
 }
 
 MachineInstr *
@@ -140,7 +200,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (isLoad)
       MemMI = BuildMI(MF, MI->getDebugLoc(),
                       get(MemOpc), MI->getOperand(0).getReg())
-        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+        .addReg(WBReg).addImm(0).addImm(Pred);
     else
       MemMI = BuildMI(MF, MI->getDebugLoc(),
                       get(MemOpc)).addReg(MI->getOperand(1).getReg())
@@ -151,7 +211,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (isLoad)
       MemMI = BuildMI(MF, MI->getDebugLoc(),
                       get(MemOpc), MI->getOperand(0).getReg())
-        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+        .addReg(BaseReg).addImm(0).addImm(Pred);
     else
       MemMI = BuildMI(MF, MI->getDebugLoc(),
                       get(MemOpc)).addReg(MI->getOperand(1).getReg())
@@ -166,8 +226,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   if (LV) {
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       MachineOperand &MO = MI->getOperand(i);
-      if (MO.isReg() && MO.getReg() &&
-          TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+      if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
         unsigned Reg = MO.getReg();
 
         LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
@@ -197,43 +256,6 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   return NewMIs[0];
 }
 
-bool
-ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const {
-  if (CSI.empty())
-    return false;
-
-  DebugLoc DL;
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
-
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    bool isKill = true;
-
-    // Add the callee-saved register as live-in unless it's LR and
-    // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress
-    // then it's already added to the function and entry block live-in sets.
-    if (Reg == ARM::LR) {
-      MachineFunction &MF = *MBB.getParent();
-      if (MF.getFrameInfo()->isReturnAddressTaken() &&
-          MF.getRegInfo().isLiveIn(Reg))
-        isKill = false;
-    }
-
-    if (isKill)
-      MBB.addLiveIn(Reg);
-
-    // Insert the spill to the stack frame. The register is killed at the spill
-    // 
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    storeRegToStackSlot(MBB, MI, Reg, isKill,
-                        CSI[i].getFrameIdx(), RC, TRI);
-  }
-  return true;
-}
-
 // Branch analysis.
 bool
 ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
@@ -275,13 +297,31 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
 
   // Get the instruction before it if it is a terminator.
   MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
 
   // If there are three terminators, we don't know what sort of block this is.
   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
     return true;
 
   // If the block ends with a B and a Bcc, handle it.
-  unsigned SecondLastOpc = SecondLastInst->getOpcode();
   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
     TBB =  SecondLastInst->getOperand(0).getMBB();
     Cond.push_back(SecondLastInst->getOperand(1));
@@ -468,7 +508,7 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
 }
 
 /// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
-DISABLE_INLINE
+LLVM_ATTRIBUTE_NOINLINE
 static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
                                 unsigned JTI);
 static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
@@ -513,6 +553,14 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   case ARMII::Size2Bytes: return 2;          // Thumb1 instruction.
   case ARMII::SizeSpecial: {
     switch (Opc) {
+    case ARM::MOVi16_ga_pcrel:
+    case ARM::MOVTi16_ga_pcrel:
+    case ARM::t2MOVi16_ga_pcrel:
+    case ARM::t2MOVTi16_ga_pcrel:
+      return 4;
+    case ARM::MOVi32imm:
+    case ARM::t2MOVi32imm:
+      return 8;
     case ARM::CONSTPOOL_ENTRY:
       // If this machine instr is a constant pool entry, its size is recorded as
       // operand #2.
@@ -533,13 +581,13 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
     case ARM::BR_JTadd:
     case ARM::tBR_JTr:
     case ARM::t2BR_JT:
-    case ARM::t2TBB:
-    case ARM::t2TBH: {
+    case ARM::t2TBB_JT:
+    case ARM::t2TBH_JT: {
       // These are jumptable branches, i.e. a branch followed by an inlined
       // jumptable. The size is 4 + 4 * number of entries. For TBB, each
       // entry is one byte; TBH two byte each.
-      unsigned EntrySize = (Opc == ARM::t2TBB)
-        ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4);
+      unsigned EntrySize = (Opc == ARM::t2TBB_JT)
+        ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4);
       unsigned NumOps = TID.getNumOperands();
       MachineOperand JTOP =
         MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
@@ -557,7 +605,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
       // alignment issue.
       unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
       unsigned NumEntries = getNumJTEntries(JT, JTI);
-      if (Opc == ARM::t2TBB && (NumEntries & 1))
+      if (Opc == ARM::t2TBB_JT && (NumEntries & 1))
         // Make sure the instruction that follows TBB is 2-byte aligned.
         // FIXME: Constant island pass should insert an "ALIGN" instruction
         // instead.
@@ -573,84 +621,6 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   return 0; // Not reached
 }
 
-unsigned
-ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const {
-  switch (MI->getOpcode()) {
-  default: break;
-  case ARM::LDR:
-  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isReg() &&
-        MI->getOperand(3).isImm() &&
-        MI->getOperand(2).getReg() == 0 &&
-        MI->getOperand(3).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  case ARM::t2LDRi12:
-  case ARM::tRestore:
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() &&
-        MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  case ARM::VLDRD:
-  case ARM::VLDRS:
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() &&
-        MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  }
-
-  return 0;
-}
-
-unsigned
-ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
-                                     int &FrameIndex) const {
-  switch (MI->getOpcode()) {
-  default: break;
-  case ARM::STR:
-  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isReg() &&
-        MI->getOperand(3).isImm() &&
-        MI->getOperand(2).getReg() == 0 &&
-        MI->getOperand(3).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  case ARM::t2STRi12:
-  case ARM::tSpill:
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() &&
-        MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  case ARM::VSTRD:
-  case ARM::VSTRS:
-    if (MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() &&
-        MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  }
-
-  return 0;
-}
-
 void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I, DebugLoc DL,
                                    unsigned DestReg, unsigned SrcReg,
@@ -715,8 +685,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   unsigned Align = MFI.getObjectAlignment(FI);
 
   MachineMemOperand *MMO =
-    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                            MachineMemOperand::MOStore, 0,
+    MF.getMachineMemOperand(MachinePointerInfo(
+                                         PseudoSourceValue::getFixedStack(FI)),
+                            MachineMemOperand::MOStore,
                             MFI.getObjectSize(FI),
                             Align);
 
@@ -728,9 +699,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
 
   switch (RC->getID()) {
   case ARM::GPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
                    .addReg(SrcReg, getKillRegState(isKill))
-                   .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
     break;
   case ARM::SPRRegClassID:
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
@@ -747,17 +718,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   case ARM::QPRRegClassID:
   case ARM::QPR_VFP2RegClassID:
   case ARM::QPR_8RegClassID:
-    // FIXME: Neon instructions should support predicates
-    if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q))
+    if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo))
                      .addFrameIndex(FI).addImm(16)
                      .addReg(SrcReg, getKillRegState(isKill))
                      .addMemOperand(MMO));
     } else {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ))
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
                      .addReg(SrcReg, getKillRegState(isKill))
                      .addFrameIndex(FI)
-                     .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
                      .addMemOperand(MMO));
     }
     break;
@@ -766,18 +735,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
       // FIXME: It's possible to only store part of the QQ register if the
       // spilled def has a sub-register index.
-      MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST1d64Q))
-        .addFrameIndex(FI).addImm(16);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-      MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
-      AddDefaultPred(MIB.addMemOperand(MMO));
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
     } else {
       MachineInstrBuilder MIB =
-        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD))
-                       .addFrameIndex(FI)
-                       .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)))
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
         .addMemOperand(MMO);
       MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
       MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
@@ -787,9 +752,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     break;
   case ARM::QQQQPRRegClassID: {
     MachineInstrBuilder MIB =
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD))
-                     .addFrameIndex(FI)
-                     .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)))
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                     .addFrameIndex(FI))
       .addMemOperand(MMO);
     MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
     MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
@@ -806,6 +770,53 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   }
 }
 
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                     int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::STRrs:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::STRi12:
+  case ARM::t2STRi12:
+  case ARM::tSpill:
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VST1q64Pseudo:
+    if (MI->getOperand(0).isFI() &&
+        MI->getOperand(2).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  case ARM::VSTMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
 void ARMBaseInstrInfo::
 loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                      unsigned DestReg, int FI,
@@ -817,8 +828,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   MachineFrameInfo &MFI = *MF.getFrameInfo();
   unsigned Align = MFI.getObjectAlignment(FI);
   MachineMemOperand *MMO =
-    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                            MachineMemOperand::MOLoad, 0,
+    MF.getMachineMemOperand(
+                    MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                            MachineMemOperand::MOLoad,
                             MFI.getObjectSize(FI),
                             Align);
 
@@ -830,8 +842,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
 
   switch (RC->getID()) {
   case ARM::GPRRegClassID:
-    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
-                   .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
     break;
   case ARM::SPRRegClassID:
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
@@ -846,31 +858,26 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   case ARM::QPRRegClassID:
   case ARM::QPR_VFP2RegClassID:
   case ARM::QPR_8RegClassID:
-    if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg)
+    if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg)
                      .addFrameIndex(FI).addImm(16)
                      .addMemOperand(MMO));
     } else {
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg)
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
                      .addFrameIndex(FI)
-                     .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
                      .addMemOperand(MMO));
     }
     break;
   case ARM::QQPRRegClassID:
   case ARM::QQPR_VFP2RegClassID:
     if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
-      MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD1d64Q));
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
-      AddDefaultPred(MIB.addFrameIndex(FI).addImm(16).addMemOperand(MMO));
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
     } else {
       MachineInstrBuilder MIB =
-        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD))
-                       .addFrameIndex(FI)
-                       .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)))
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                       .addFrameIndex(FI))
         .addMemOperand(MMO);
       MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
       MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
@@ -880,9 +887,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     break;
   case ARM::QQQQPRRegClassID: {
     MachineInstrBuilder MIB =
-      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD))
-                     .addFrameIndex(FI)
-                     .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)))
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                     .addFrameIndex(FI))
       .addMemOperand(MMO);
     MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
     MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
@@ -899,6 +905,53 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   }
 }
 
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDRrs:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tRestore:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLD1q64Pseudo:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
 MachineInstr*
 ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
                                            int FrameIx, uint64_t Offset,
@@ -921,7 +974,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
   ARMConstantPoolValue *ACPV =
     static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
 
-  unsigned PCLabelId = AFI->createConstPoolEntryUId();
+  unsigned PCLabelId = AFI->createPICLabelUId();
   ARMConstantPoolValue *NewCPV = 0;
   // FIXME: The below assumes PIC relocation model and that the function
   // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
@@ -991,12 +1044,18 @@ ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
 }
 
 bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
-                                        const MachineInstr *MI1) const {
+                                        const MachineInstr *MI1,
+                                        const MachineRegisterInfo *MRI) const {
   int Opcode = MI0->getOpcode();
   if (Opcode == ARM::t2LDRpci ||
       Opcode == ARM::t2LDRpci_pic ||
       Opcode == ARM::tLDRpci ||
-      Opcode == ARM::tLDRpci_pic) {
+      Opcode == ARM::tLDRpci_pic ||
+      Opcode == ARM::MOV_ga_dyn ||
+      Opcode == ARM::MOV_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel_ldr ||
+      Opcode == ARM::t2MOV_ga_dyn ||
+      Opcode == ARM::t2MOV_ga_pcrel) {
     if (MI1->getOpcode() != Opcode)
       return false;
     if (MI0->getNumOperands() != MI1->getNumOperands())
@@ -1007,6 +1066,14 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
     if (MO0.getOffset() != MO1.getOffset())
       return false;
 
+    if (Opcode == ARM::MOV_ga_dyn ||
+        Opcode == ARM::MOV_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel_ldr ||
+        Opcode == ARM::t2MOV_ga_dyn ||
+        Opcode == ARM::t2MOV_ga_pcrel)
+      // Ignore the PC labels.
+      return MO0.getGlobal() == MO1.getGlobal();
+
     const MachineFunction *MF = MI0->getParent()->getParent();
     const MachineConstantPool *MCP = MF->getConstantPool();
     int CPI0 = MO0.getIndex();
@@ -1018,6 +1085,37 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
     ARMConstantPoolValue *ACPV1 =
       static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
     return ACPV0->hasSameValue(ACPV1);
+  } else if (Opcode == ARM::PICLDR) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    unsigned Addr0 = MI0->getOperand(1).getReg();
+    unsigned Addr1 = MI1->getOperand(1).getReg();
+    if (Addr0 != Addr1) {
+      if (!MRI ||
+          !TargetRegisterInfo::isVirtualRegister(Addr0) ||
+          !TargetRegisterInfo::isVirtualRegister(Addr1))
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Addr0);
+      MachineInstr *Def1 = MRI->getVRegDef(Addr1);
+      // Check if the loaded value, e.g. a constantpool of a global address, are
+      // the same.
+      if (!produceSameValue(Def0, Def1, MRI))
+        return false;
+    }
+
+    for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+      // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+      const MachineOperand &MO0 = MI0->getOperand(i);
+      const MachineOperand &MO1 = MI1->getOperand(i);
+      if (!MO0.isIdenticalTo(MO1))
+        return false;
+    }
+    return true;
   }
 
   return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
@@ -1040,8 +1138,8 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   switch (Load1->getMachineOpcode()) {
   default:
     return false;
-  case ARM::LDR:
-  case ARM::LDRB:
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
   case ARM::LDRD:
   case ARM::LDRH:
   case ARM::LDRSB:
@@ -1059,8 +1157,8 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   switch (Load2->getMachineOpcode()) {
   default:
     return false;
-  case ARM::LDR:
-  case ARM::LDRB:
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
   case ARM::LDRD:
   case ARM::LDRH:
   case ARM::LDRSB:
@@ -1164,22 +1262,37 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
   return false;
 }
 
-bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const {
-  if (!NumInstrs)
+bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                           unsigned NumCyles,
+                                           unsigned ExtraPredCycles,
+                                           float Probability,
+                                           float Confidence) const {
+  if (!NumCyles)
     return false;
-  if (Subtarget.getCPUString() == "generic")
-    // Generic (and overly aggressive) if-conversion limits for testing.
-    return NumInstrs <= 10;
-  else if (Subtarget.hasV7Ops())
-    return NumInstrs <= 3;
-  return NumInstrs <= 2;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  float UnpredCost = Probability * NumCyles;
+  UnpredCost += 1.0; // The branch itself
+  UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();
+
+  return (float)(NumCyles + ExtraPredCycles) < UnpredCost;
 }
-  
+
 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-                    MachineBasicBlock &FMBB, unsigned NumF) const {
-  return NumT && NumF && NumT <= 2 && NumF <= 2;
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
+                    float Probability, float Confidence) const {
+  if (!TCycles || !FCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  float UnpredCost = Probability * TCycles + (1.0 - Probability) * FCycles;
+  UnpredCost += 1.0; // The branch itself
+  UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();
+
+  return (float)(TCycles + FCycles + TExtra + FExtra) < UnpredCost;
 }
 
 /// getInstrPredicate - If instruction is predicated, returns its predicate
@@ -1292,6 +1405,12 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
     unsigned NumBits = 0;
     unsigned Scale = 1;
     switch (AddrMode) {
+    case ARMII::AddrMode_i12: {
+      ImmIdx = FrameRegIdx + 1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = 12;
+      break;
+    }
     case ARMII::AddrMode2: {
       ImmIdx = FrameRegIdx+2;
       InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
@@ -1342,8 +1461,15 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
       if ((unsigned)Offset <= Mask * Scale) {
         // Replace the FrameIndex with sp
         MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-        if (isSub)
-          ImmedOffset |= 1 << NumBits;
+        // FIXME: When addrmode2 goes away, this will simplify (like the
+        // T2 version), as the LDR.i12 versions don't need the encoding
+        // tricks for the offset value.
+        if (isSub) {
+          if (AddrMode == ARMII::AddrMode_i12)
+            ImmedOffset = -ImmedOffset;
+          else
+            ImmedOffset |= 1 << NumBits;
+        }
         ImmOp.ChangeToImmediate(ImmedOffset);
         Offset = 0;
         return true;
@@ -1351,8 +1477,12 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
 
       // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
       ImmedOffset = ImmedOffset & Mask;
-      if (isSub)
-        ImmedOffset |= 1 << NumBits;
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode_i12)
+          ImmedOffset = -ImmedOffset;
+        else
+          ImmedOffset |= 1 << NumBits;
+      }
       ImmOp.ChangeToImmediate(ImmedOffset);
       Offset &= ~(Mask*Scale);
     }
@@ -1363,25 +1493,88 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
 }
 
 bool ARMBaseInstrInfo::
-AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpValue) const {
+AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask,
+               int &CmpValue) const {
   switch (MI->getOpcode()) {
   default: break;
   case ARM::CMPri:
-  case ARM::CMPzri:
   case ARM::t2CMPri:
-  case ARM::t2CMPzri:
     SrcReg = MI->getOperand(0).getReg();
+    CmpMask = ~0;
     CmpValue = MI->getOperand(1).getImm();
     return true;
+  case ARM::TSTri:
+  case ARM::t2TSTri:
+    SrcReg = MI->getOperand(0).getReg();
+    CmpMask = MI->getOperand(1).getImm();
+    CmpValue = 0;
+    return true;
   }
 
   return false;
 }
 
-/// ConvertToSetZeroFlag - Convert the instruction to set the "zero" flag so
-/// that we can remove a "comparison with zero".
+/// isSuitableForMask - Identify a suitable 'and' instruction that
+/// operates on the given source register and applies the same mask
+/// as a 'tst' instruction. Provide a limited look-through for copies.
+/// When successful, MI will hold the found instruction.
+static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+                              int CmpMask, bool CommonUse) {
+  switch (MI->getOpcode()) {
+    case ARM::ANDri:
+    case ARM::t2ANDri:
+      if (CmpMask != MI->getOperand(2).getImm())
+        return false;
+      if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg())
+        return true;
+      break;
+    case ARM::COPY: {
+      // Walk down one instruction which is potentially an 'and'.
+      const MachineInstr &Copy = *MI;
+      MachineBasicBlock::iterator AND(
+        llvm::next(MachineBasicBlock::iterator(MI)));
+      if (AND == MI->getParent()->end()) return false;
+      MI = AND;
+      return isSuitableForMask(MI, Copy.getOperand(0).getReg(),
+                               CmpMask, true);
+    }
+  }
+
+  return false;
+}
+
+/// OptimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
 bool ARMBaseInstrInfo::
-ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const {
+OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
+                     int CmpValue, const MachineRegisterInfo *MRI) const {
+  if (CmpValue != 0)
+    return false;
+
+  MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg);
+  if (llvm::next(DI) != MRI->def_end())
+    // Only support one definition.
+    return false;
+
+  MachineInstr *MI = &*DI;
+
+  // Masked compares sometimes use the same register as the corresponding 'and'.
+  if (CmpMask != ~0) {
+    if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) {
+      MI = 0;
+      for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
+           UE = MRI->use_end(); UI != UE; ++UI) {
+        if (UI->getParent() != CmpInstr->getParent()) continue;
+        MachineInstr *PotentialAND = &*UI;
+        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true))
+          continue;
+        MI = PotentialAND;
+        break;
+      }
+      if (!MI) return false;
+    }
+  }
+
   // Conservatively refuse to convert an instruction which isn't in the same BB
   // as the comparison.
   if (MI->getParent() != CmpInstr->getParent())
@@ -1391,16 +1584,20 @@ ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const {
   // want to change.
   MachineBasicBlock::const_iterator I = CmpInstr, E = MI,
     B = MI->getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B) return false;
+
   --I;
   for (; I != E; --I) {
     const MachineInstr &Instr = *I;
 
     for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) {
       const MachineOperand &MO = Instr.getOperand(IO);
-      if (!MO.isReg() || !MO.isDef()) continue;
+      if (!MO.isReg()) continue;
 
-      // This instruction modifies CPSR before the one we want to change. We
-      // can't do this transformation.
+      // This instruction modifies or uses CPSR after the one we want to
+      // change. We can't do this transformation.
       if (MO.getReg() == ARM::CPSR)
         return false;
     }
@@ -1414,15 +1611,713 @@ ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const {
   switch (MI->getOpcode()) {
   default: break;
   case ARM::ADDri:
+  case ARM::ANDri:
+  case ARM::t2ANDri:
   case ARM::SUBri:
   case ARM::t2ADDri:
   case ARM::t2SUBri:
-    MI->RemoveOperand(5);
-    MachineInstrBuilder(MI)
-      .addReg(ARM::CPSR, RegState::Define | RegState::Implicit);
+    // Toggle the optional operand to CPSR.
+    MI->getOperand(5).setReg(ARM::CPSR);
+    MI->getOperand(5).setIsDef(true);
     CmpInstr->eraseFromParent();
     return true;
   }
 
   return false;
 }
+
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
+                                     MachineInstr *DefMI, unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  // Fold large immediates into add, sub, or, xor.
+  unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
+    return false;
+  if (!DefMI->getOperand(1).isImm())
+    // Could be t2MOVi32imm <ga:xx>
+    return false;
+
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  unsigned UseOpc = UseMI->getOpcode();
+  unsigned NewUseOpc = 0;
+  uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm();
+  uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
+  bool Commute = false;
+  switch (UseOpc) {
+  default: return false;
+  case ARM::SUBrr:
+  case ARM::ADDrr:
+  case ARM::ORRrr:
+  case ARM::EORrr:
+  case ARM::t2SUBrr:
+  case ARM::t2ADDrr:
+  case ARM::t2ORRrr:
+  case ARM::t2EORrr: {
+    Commute = UseMI->getOperand(2).getReg() != Reg;
+    switch (UseOpc) {
+    default: break;
+    case ARM::SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::SUBri;
+      // Fallthrough
+    }
+    case ARM::ADDrr:
+    case ARM::ORRrr:
+    case ARM::EORrr: {
+      if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::ADDrr: NewUseOpc = ARM::ADDri; break;
+      case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
+      case ARM::EORrr: NewUseOpc = ARM::EORri; break;
+      }
+      break;
+    }
+    case ARM::t2SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::t2SUBri;
+      // Fallthrough
+    }
+    case ARM::t2ADDrr:
+    case ARM::t2ORRrr:
+    case ARM::t2EORrr: {
+      if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break;
+      case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
+      case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
+      }
+      break;
+    }
+    }
+  }
+  }
+
+  unsigned OpIdx = Commute ? 2 : 1;
+  unsigned Reg1 = UseMI->getOperand(OpIdx).getReg();
+  bool isKill = UseMI->getOperand(OpIdx).isKill();
+  unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+  AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(),
+                                      *UseMI, UseMI->getDebugLoc(),
+                                      get(NewUseOpc), NewReg)
+                              .addReg(Reg1, getKillRegState(isKill))
+                              .addImm(SOImmValV1)));
+  UseMI->setDesc(get(NewUseOpc));
+  UseMI->getOperand(1).setReg(NewReg);
+  UseMI->getOperand(1).setIsKill();
+  UseMI->getOperand(2).ChangeToImmediate(SOImmValV2);
+  DefMI->eraseFromParent();
+  return true;
+}
+
+unsigned
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                 const MachineInstr *MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const TargetInstrDesc &Desc = MI->getDesc();
+  unsigned Class = Desc.getSchedClass();
+  unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
+  if (UOps)
+    return UOps;
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected multi-uops instruction!");
+    break;
+  case ARM::VLDMQIA:
+  case ARM::VLDMQDB:
+  case ARM::VSTMQIA:
+  case ARM::VSTMQDB:
+    return 2;
+
+  // The number of uOps for load / store multiple are determined by the number
+  // registers.
+  //
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the the address is not 64-bit aligned.
+  //
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle.  For VFP / NEON
+  // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+  case ARM::VLDMDIA:
+  case ARM::VLDMDDB:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSDB:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDDB:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSDB:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+    return (NumRegs / 2) + (NumRegs % 2) + 1;
+  }
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isCortexA8()) {
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++UOps;
+      return UOps;
+    } else if (Subtarget.isCortexA9()) {
+      UOps = (NumRegs / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++UOps;
+      return UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
+  }
+  }
+}
+
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const TargetInstrDesc &DefTID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8()) {
+    // (regno / 2) + (regno % 2) + 1
+    DefCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++DefCycle;
+  } else if (Subtarget.isCortexA9()) {
+    DefCycle = RegNo;
+    bool isSLoad = false;
+
+    switch (DefTID.getOpcode()) {
+    default: break;
+    case ARM::VLDMSIA:
+    case ARM::VLDMSDB:
+    case ARM::VLDMSIA_UPD:
+    case ARM::VLDMSDB_UPD:
+      isSLoad = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+      ++DefCycle;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const TargetInstrDesc &DefTID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    DefCycle = RegNo / 2;
+    if (DefCycle < 1)
+      DefCycle = 1;
+    // Result latency is issue cycle + 2: E2.
+    DefCycle += 2;
+  } else if (Subtarget.isCortexA9()) {
+    DefCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || DefAlign < 8)
+      ++DefCycle;
+    // Result latency is AGU cycles + 2.
+    DefCycle += 2;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const TargetInstrDesc &UseTID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8()) {
+    // (regno / 2) + (regno % 2) + 1
+    UseCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++UseCycle;
+  } else if (Subtarget.isCortexA9()) {
+    UseCycle = RegNo;
+    bool isSStore = false;
+
+    switch (UseTID.getOpcode()) {
+    default: break;
+    case ARM::VSTMSIA:
+    case ARM::VSTMSDB:
+    case ARM::VSTMSIA_UPD:
+    case ARM::VSTMSDB_UPD:
+      isSStore = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = RegNo + 2;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const TargetInstrDesc &UseTID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8()) {
+    UseCycle = RegNo / 2;
+    if (UseCycle < 2)
+      UseCycle = 2;
+    // Read in E3.
+    UseCycle += 2;
+  } else if (Subtarget.isCortexA9()) {
+    UseCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = 1;
+  }
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const TargetInstrDesc &DefTID,
+                                    unsigned DefIdx, unsigned DefAlign,
+                                    const TargetInstrDesc &UseTID,
+                                    unsigned UseIdx, unsigned UseAlign) const {
+  unsigned DefClass = DefTID.getSchedClass();
+  unsigned UseClass = UseTID.getSchedClass();
+
+  if (DefIdx < DefTID.getNumDefs() && UseIdx < UseTID.getNumOperands())
+    return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+  // This may be a def / use of a variable_ops instruction, the operand
+  // latency might be determinable dynamically. Let the target try to
+  // figure it out.
+  int DefCycle = -1;
+  bool LdmBypass = false;
+  switch (DefTID.getOpcode()) {
+  default:
+    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    break;
+
+  case ARM::VLDMDIA:
+  case ARM::VLDMDDB:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSDB:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+    DefCycle = getVLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign);
+    break;
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+    LdmBypass = 1;
+    DefCycle = getLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign);
+    break;
+  }
+
+  if (DefCycle == -1)
+    // We can't seem to determine the result latency of the def, assume it's 2.
+    DefCycle = 2;
+
+  int UseCycle = -1;
+  switch (UseTID.getOpcode()) {
+  default:
+    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+    break;
+
+  case ARM::VSTMDIA:
+  case ARM::VSTMDDB:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSDB:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+    UseCycle = getVSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign);
+    break;
+
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tSTMIA:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    UseCycle = getSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign);
+    break;
+  }
+
+  if (UseCycle == -1)
+    // Assume it's read in the first stage.
+    UseCycle = 1;
+
+  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > 0) {
+    if (LdmBypass) {
+      // It's a variable_ops instruction so we can't use DefIdx here. Just use
+      // first def operand.
+      if (ItinData->hasPipelineForwarding(DefClass, DefTID.getNumOperands()-1,
+                                          UseClass, UseIdx))
+        --UseCycle;
+    } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+                                               UseClass, UseIdx)) {
+      --UseCycle;
+    }
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const {
+  if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
+      DefMI->isRegSequence() || DefMI->isImplicitDef())
+    return 1;
+
+  const TargetInstrDesc &DefTID = DefMI->getDesc();
+  if (!ItinData || ItinData->isEmpty())
+    return DefTID.mayLoad() ? 3 : 1;
+
+  const TargetInstrDesc &UseTID = UseMI->getDesc();
+  const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+  if (DefMO.getReg() == ARM::CPSR) {
+    if (DefMI->getOpcode() == ARM::FMSTAT) {
+      // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+      return Subtarget.isCortexA9() ? 1 : 20;
+    }
+
+    // CPSR set and branch can be paired in the same cycle.
+    if (UseTID.isBranch())
+      return 0;
+  }
+
+  unsigned DefAlign = DefMI->hasOneMemOperand()
+    ? (*DefMI->memoperands_begin())->getAlignment() : 0;
+  unsigned UseAlign = UseMI->hasOneMemOperand()
+    ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
+                                  UseTID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefTID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI->getOperand(3).getImm();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  }
+
+  return Latency;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const TargetInstrDesc &DefTID = get(DefNode->getMachineOpcode());
+
+  if (isZeroCost(DefTID.Opcode))
+    return 0;
+
+  if (!ItinData || ItinData->isEmpty())
+    return DefTID.mayLoad() ? 3 : 1;
+
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+    if (Subtarget.isCortexA9())
+      return Latency <= 2 ? 1 : Latency - 1;
+    else
+      return Latency <= 3 ? 1 : Latency - 2;
+  }
+
+  const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode());
+  const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
+                                  UseTID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefTID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  }
+
+  return Latency;
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      const MachineInstr *MI,
+                                      unsigned *PredCost) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned Class = TID.getSchedClass();
+  unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
+  if (PredCost && TID.hasImplicitDefOfPhysReg(ARM::CPSR))
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    *PredCost = 1;
+  if (UOps)
+    return ItinData->getStageLatency(Class);
+  return getNumMicroOps(ItinData, MI);
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQIA:
+  case ARM::VLDMQDB:
+  case ARM::VSTMQIA:
+  case ARM::VSTMQDB:
+    return 2;
+  }
+}
+
+bool ARMBaseInstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (Subtarget.isCortexA8() &&
+      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
+    // CortexA8 VFP instructions are not pipelined.
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
+  int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+  if (Latency <= 3)
+    return false;
+  return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
+         UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
+}
+
+bool ARMBaseInstrInfo::
+hasLowDefLatency(const InstrItineraryData *ItinData,
+                 const MachineInstr *DefMI, unsigned DefIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return false;
+
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (DDomain == ARMII::DomainGeneral) {
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    return (DefCycle != -1 && DefCycle <= 2);
+  }
+  return false;
+}
+
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
+  if (I == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
+  MulOpc = Entry.MulOpc;
+  AddSubOpc = Entry.AddSubOpc;
+  NegAcc = Entry.NegAcc;
+  HasLane = Entry.HasLane;
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index b4f4a33..1fb8872 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -17,6 +17,8 @@
 #include "ARM.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
 
 namespace llvm {
   class ARMSubtarget;
@@ -33,7 +35,7 @@ namespace ARMII {
     //===------------------------------------------------------------------===//
     // This four-bit field describes the addressing mode used.
 
-    AddrModeMask  = 0xf,
+    AddrModeMask  = 0x1f,
     AddrModeNone    = 0,
     AddrMode1       = 1,
     AddrMode2       = 2,
@@ -50,9 +52,10 @@ namespace ARMII {
     AddrModeT2_so   = 13,
     AddrModeT2_pc   = 14, // +/- i12 for pc relative data
     AddrModeT2_i8s4 = 15, // i8 * 4
+    AddrMode_i12    = 16,
 
     // Size* - Flags to keep track of the size of an instruction.
-    SizeShift     = 4,
+    SizeShift     = 5,
     SizeMask      = 7 << SizeShift,
     SizeSpecial   = 1,   // 0 byte pseudo or special case.
     Size8Bytes    = 2,
@@ -61,7 +64,7 @@ namespace ARMII {
 
     // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
     // and store ops only.  Generic "updating" flag is used for ld/st multiple.
-    IndexModeShift = 7,
+    IndexModeShift = 8,
     IndexModeMask  = 3 << IndexModeShift,
     IndexModePre   = 1,
     IndexModePost  = 2,
@@ -70,7 +73,7 @@ namespace ARMII {
     //===------------------------------------------------------------------===//
     // Instruction encoding formats.
     //
-    FormShift     = 9,
+    FormShift     = 10,
     FormMask      = 0x3f << FormShift,
 
     // Pseudo instructions
@@ -143,15 +146,15 @@ namespace ARMII {
 
     // UnaryDP - Indicates this is a unary data processing instruction, i.e.
     // it doesn't have a Rn operand.
-    UnaryDP       = 1 << 15,
+    UnaryDP       = 1 << 16,
 
     // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
     // a 16-bit Thumb instruction if certain conditions are met.
-    Xform16Bit    = 1 << 16,
+    Xform16Bit    = 1 << 17,
 
     //===------------------------------------------------------------------===//
     // Code domain.
-    DomainShift   = 17,
+    DomainShift   = 18,
     DomainMask    = 3 << DomainShift,
     DomainGeneral = 0 << DomainShift,
     DomainVFP     = 1 << DomainShift,
@@ -160,6 +163,11 @@ namespace ARMII {
     //===------------------------------------------------------------------===//
     // Field shifts - such shifts are used to set field while generating
     // machine instructions.
+    //
+    // FIXME: This list will need adjusting/fixing as the MC code emitter
+    // takes shape and the ARMCodeEmitter.cpp bits go away.
+    ShiftTypeShift = 4,
+
     M_BitShift     = 5,
     ShiftImmShift  = 5,
     ShiftShift     = 7,
@@ -181,29 +189,15 @@ namespace ARMII {
     I_BitShift     = 25,
     CondShift      = 28
   };
-
-  /// Target Operand Flag enum.
-  enum TOF {
-    //===------------------------------------------------------------------===//
-    // ARM Specific MachineOperand flags.
-
-    MO_NO_FLAG,
-
-    /// MO_LO16 - On a symbol operand, this represents a relocation containing
-    /// lower 16 bit of the address. Used only via movw instruction.
-    MO_LO16,
-
-    /// MO_HI16 - On a symbol operand, this represents a relocation containing
-    /// higher 16 bit of the address. Used only via movt instruction.
-    MO_HI16
-  };
 }
 
 class ARMBaseInstrInfo : public TargetInstrInfoImpl {
   const ARMSubtarget &Subtarget;
+
 protected:
   // Can be only subclassed.
   explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
 public:
   // Return the non-pre/post incrementing version of 'Opc'. Return 0
   // if there is not such an opcode.
@@ -216,10 +210,13 @@ public:
   virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
   const ARMSubtarget &getSubtarget() const { return Subtarget; }
 
-  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MI,
-                                 const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetMachine *TM,
+                               const ScheduleDAG *DAG) const;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const;
 
   // Branch analysis.
   virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -301,7 +298,8 @@ public:
   MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const;
 
   virtual bool produceSameValue(const MachineInstr *MI0,
-                                const MachineInstr *MI1) const;
+                                const MachineInstr *MI1,
+                                const MachineRegisterInfo *MRI) const;
 
   /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
   /// determine if two loads are loading from the same base address. It should
@@ -328,26 +326,117 @@ public:
                                     const MachineFunction &MF) const;
 
   virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumInstrs) const;
+                                   unsigned NumCyles, unsigned ExtraPredCycles,
+                                   float Prob, float Confidence) const;
 
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT,
-                                   MachineBasicBlock &FMBB,unsigned NumF) const;
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumT, unsigned ExtraT,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumF, unsigned ExtraF,
+                                   float Probability, float Confidence) const;
 
   virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumInstrs) const {
-    return NumInstrs && NumInstrs == 1;
+                                         unsigned NumCyles,
+                                         float Probability,
+                                         float Confidence) const {
+    return NumCyles == 1;
   }
 
   /// AnalyzeCompare - For a comparison instruction, return the source register
   /// in SrcReg and the value it compares against in CmpValue. Return true if
   /// the comparison instruction can be analyzed.
   virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
-                              int &CmpValue) const;
+                              int &CmpMask, int &CmpValue) const;
 
-  /// ConvertToSetZeroFlag - Convert the instruction to set the zero flag so
+  /// OptimizeCompareInstr - Convert the instruction to set the zero flag so
   /// that we can remove a "comparison with zero".
-  virtual bool ConvertToSetZeroFlag(MachineInstr *Instr,
-                                    MachineInstr *CmpInstr) const;
+  virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                                    int CmpMask, int CmpValue,
+                                    const MachineRegisterInfo *MRI) const;
+
+  /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+  /// instruction, try to fold the immediate into the use instruction.
+  virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                             unsigned Reg, MachineRegisterInfo *MRI) const;
+
+  virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                                  const MachineInstr *MI) const;
+
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI, unsigned UseIdx) const;
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const;
+private:
+  int getVLDMDefCycle(const InstrItineraryData *ItinData,
+                      const TargetInstrDesc &DefTID,
+                      unsigned DefClass,
+                      unsigned DefIdx, unsigned DefAlign) const;
+  int getLDMDefCycle(const InstrItineraryData *ItinData,
+                     const TargetInstrDesc &DefTID,
+                     unsigned DefClass,
+                     unsigned DefIdx, unsigned DefAlign) const;
+  int getVSTMUseCycle(const InstrItineraryData *ItinData,
+                      const TargetInstrDesc &UseTID,
+                      unsigned UseClass,
+                      unsigned UseIdx, unsigned UseAlign) const;
+  int getSTMUseCycle(const InstrItineraryData *ItinData,
+                     const TargetInstrDesc &UseTID,
+                     unsigned UseClass,
+                     unsigned UseIdx, unsigned UseAlign) const;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const TargetInstrDesc &DefTID,
+                        unsigned DefIdx, unsigned DefAlign,
+                        const TargetInstrDesc &UseTID,
+                        unsigned UseIdx, unsigned UseAlign) const;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      const MachineInstr *MI, unsigned *PredCost = 0) const;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const;
+
+  bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const;
+  bool hasLowDefLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx) const;
+
+private:
+  /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+  /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+  /// MLx table.
+  DenseMap<unsigned, unsigned> MLxEntryMap;
+
+  /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+  /// stalls when scheduled together with fp MLA / MLS opcodes.
+  SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+  /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
+  /// instruction.
+  bool isFpMLxInstruction(unsigned Opcode) const {
+    return MLxEntryMap.count(Opcode);
+  }
+
+  /// isFpMLxInstruction - This version also returns the multiply opcode and the
+  /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
+  /// the MLX instructions with an extra lane operand.
+  bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                          unsigned &AddSubOpc, bool &NegAcc,
+                          bool &HasLane) const;
+
+  /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
+  /// will cause stalls when scheduled after (within 4-cycle window) a fp
+  /// MLA / MLS instruction.
+  bool canCauseFpMLxStall(unsigned Opcode) const {
+    return MLxHazardOpcodes.count(Opcode);
+  }
 };
 
 static inline
@@ -389,7 +478,7 @@ bool isJumpTableBranchOpcode(int Opc) {
 
 static inline
 bool isIndirectBranchOpcode(int Opc) {
-  return Opc == ARM::BRIND || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
+  return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
 }
 
 /// getInstrPredicate - If instruction is predicated, returns its predicate
@@ -413,6 +502,12 @@ void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                             unsigned DestReg, unsigned BaseReg, int NumBytes,
                             ARMCC::CondCodes Pred, unsigned PredReg,
                             const ARMBaseInstrInfo &TII);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII,
+                               const ARMBaseRegisterInfo& MRI,
+                               DebugLoc dl);
 
 
 /// rewriteARMFrameIndex / rewriteT2FrameIndex -
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index eceafad..67a4b7d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -15,6 +15,7 @@
 #include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
+#include "ARMFrameLowering.h"
 #include "ARMInstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
@@ -32,120 +33,25 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/CommandLine.h"
 
-namespace llvm {
+using namespace llvm;
+
 static cl::opt<bool>
 ForceAllBaseRegAlloc("arm-force-base-reg-alloc", cl::Hidden, cl::init(false),
           cl::desc("Force use of virtual base registers for stack load/store"));
 static cl::opt<bool>
 EnableLocalStackAlloc("enable-local-stack-alloc", cl::init(true), cl::Hidden,
           cl::desc("Enable pre-regalloc stack frame index allocation"));
-}
-
-using namespace llvm;
-
 static cl::opt<bool>
 EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true),
           cl::desc("Enable use of a base pointer for complex stack frames"));
 
-unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
-                                                   bool *isSPVFP) {
-  if (isSPVFP)
-    *isSPVFP = false;
-
-  using namespace ARM;
-  switch (RegEnum) {
-  default:
-    llvm_unreachable("Unknown ARM register!");
-  case R0:  case D0:  case Q0:  return 0;
-  case R1:  case D1:  case Q1:  return 1;
-  case R2:  case D2:  case Q2:  return 2;
-  case R3:  case D3:  case Q3:  return 3;
-  case R4:  case D4:  case Q4:  return 4;
-  case R5:  case D5:  case Q5:  return 5;
-  case R6:  case D6:  case Q6:  return 6;
-  case R7:  case D7:  case Q7:  return 7;
-  case R8:  case D8:  case Q8:  return 8;
-  case R9:  case D9:  case Q9:  return 9;
-  case R10: case D10: case Q10: return 10;
-  case R11: case D11: case Q11: return 11;
-  case R12: case D12: case Q12: return 12;
-  case SP:  case D13: case Q13: return 13;
-  case LR:  case D14: case Q14: return 14;
-  case PC:  case D15: case Q15: return 15;
-
-  case D16: return 16;
-  case D17: return 17;
-  case D18: return 18;
-  case D19: return 19;
-  case D20: return 20;
-  case D21: return 21;
-  case D22: return 22;
-  case D23: return 23;
-  case D24: return 24;
-  case D25: return 25;
-  case D26: return 26;
-  case D27: return 27;
-  case D28: return 28;
-  case D29: return 29;
-  case D30: return 30;
-  case D31: return 31;
-
-  case S0: case S1: case S2: case S3:
-  case S4: case S5: case S6: case S7:
-  case S8: case S9: case S10: case S11:
-  case S12: case S13: case S14: case S15:
-  case S16: case S17: case S18: case S19:
-  case S20: case S21: case S22: case S23:
-  case S24: case S25: case S26: case S27:
-  case S28: case S29: case S30: case S31: {
-    if (isSPVFP)
-      *isSPVFP = true;
-    switch (RegEnum) {
-    default: return 0; // Avoid compile time warning.
-    case S0: return 0;
-    case S1: return 1;
-    case S2: return 2;
-    case S3: return 3;
-    case S4: return 4;
-    case S5: return 5;
-    case S6: return 6;
-    case S7: return 7;
-    case S8: return 8;
-    case S9: return 9;
-    case S10: return 10;
-    case S11: return 11;
-    case S12: return 12;
-    case S13: return 13;
-    case S14: return 14;
-    case S15: return 15;
-    case S16: return 16;
-    case S17: return 17;
-    case S18: return 18;
-    case S19: return 19;
-    case S20: return 20;
-    case S21: return 21;
-    case S22: return 22;
-    case S23: return 23;
-    case S24: return 24;
-    case S25: return 25;
-    case S26: return 26;
-    case S27: return 27;
-    case S28: return 28;
-    case S29: return 29;
-    case S30: return 30;
-    case S31: return 31;
-    }
-  }
-  }
-}
-
 ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
                                          const ARMSubtarget &sti)
   : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
@@ -180,12 +86,14 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector ARMBaseRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
   // FIXME: avoid re-calculating this everytime.
   BitVector Reserved(getNumRegs());
   Reserved.set(ARM::SP);
   Reserved.set(ARM::PC);
   Reserved.set(ARM::FPSCR);
-  if (hasFP(MF))
+  if (TFI->hasFP(MF))
     Reserved.set(FramePtr);
   if (hasBasePointer(MF))
     Reserved.set(BasePtr);
@@ -197,6 +105,8 @@ getReservedRegs(const MachineFunction &MF) const {
 
 bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
                                         unsigned Reg) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
   switch (Reg) {
   default: break;
   case ARM::SP:
@@ -208,7 +118,7 @@ bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
     break;
   case ARM::R7:
   case ARM::R11:
-    if (FramePtr == Reg && hasFP(MF))
+    if (FramePtr == Reg && TFI->hasFP(MF))
       return true;
     break;
   case ARM::R9:
@@ -444,6 +354,7 @@ std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
 ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
                                         unsigned HintType, unsigned HintReg,
                                         const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   // Alternative register allocation orders when favoring even / odd registers
   // of register pairs.
 
@@ -525,7 +436,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
       return std::make_pair(RC->allocation_order_begin(MF),
                             RC->allocation_order_end(MF));
 
-    if (!hasFP(MF)) {
+    if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
         return std::make_pair(GPREven1,
                               GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
@@ -554,7 +465,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
       return std::make_pair(RC->allocation_order_begin(MF),
                             RC->allocation_order_end(MF));
 
-    if (!hasFP(MF)) {
+    if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
         return std::make_pair(GPROdd1,
                               GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
@@ -606,7 +517,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
   std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
   if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
        Hint.first == (unsigned)ARMRI::RegPairEven) &&
-      Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+      TargetRegisterInfo::isVirtualRegister(Hint.second)) {
     // If 'Reg' is one of the even / odd register pair and it's now changed
     // (e.g. coalesced) into a different register. The other register of the
     // pair allocation hint must be updated to reflect the relationship
@@ -619,23 +530,6 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
   }
 }
 
-/// hasFP - Return true if the specified function should have a dedicated frame
-/// pointer register.  This is true if the function has variable sized allocas
-/// or if frame pointer elimination is disabled.
-///
-bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
-  // Mac OS X requires FP not to be clobbered for backtracing purpose.
-  if (STI.isTargetDarwin())
-    return true;
-
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  // Always eliminate non-leaf frame pointers.
-  return ((DisableFramePointerElim(MF) && MFI->hasCalls()) ||
-          needsStackRealignment(MF) ||
-          MFI->hasVarSizedObjects() ||
-          MFI->isFrameAddressTaken());
-}
-
 bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -681,7 +575,7 @@ bool ARMBaseRegisterInfo::
 needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
   bool requiresRealignment = ((MFI->getLocalFrameMaxAlign() > StackAlign) ||
                                F->hasFnAttr(Attribute::StackAlignment));
 
@@ -697,417 +591,19 @@ cannotEliminateFrame(const MachineFunction &MF) const {
     || needsStackRealignment(MF);
 }
 
-/// estimateStackSize - Estimate and return the size of the frame.
-static unsigned estimateStackSize(MachineFunction &MF) {
-  const MachineFrameInfo *FFI = MF.getFrameInfo();
-  int Offset = 0;
-  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
-    int FixedOff = -FFI->getObjectOffset(i);
-    if (FixedOff > Offset) Offset = FixedOff;
-  }
-  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
-    if (FFI->isDeadObjectIndex(i))
-      continue;
-    Offset += FFI->getObjectSize(i);
-    unsigned Align = FFI->getObjectAlignment(i);
-    // Adjust to alignment boundary
-    Offset = (Offset+Align-1)/Align*Align;
-  }
-  return (unsigned)Offset;
-}
-
-/// estimateRSStackSizeLimit - Look at each instruction that references stack
-/// frames and return the stack size limit beyond which some of these
-/// instructions will require a scratch register during their expansion later.
-unsigned
-ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const {
-  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned Limit = (1 << 12) - 1;
-  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
-    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
-      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (!I->getOperand(i).isFI()) continue;
-
-        // When using ADDri to get the address of a stack object, 255 is the
-        // largest offset guaranteed to fit in the immediate offset.
-        if (I->getOpcode() == ARM::ADDri) {
-          Limit = std::min(Limit, (1U << 8) - 1);
-          break;
-        }
-
-        // Otherwise check the addressing mode.
-        switch (I->getDesc().TSFlags & ARMII::AddrModeMask) {
-        case ARMII::AddrMode3:
-        case ARMII::AddrModeT2_i8:
-          Limit = std::min(Limit, (1U << 8) - 1);
-          break;
-        case ARMII::AddrMode5:
-        case ARMII::AddrModeT2_i8s4:
-          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
-          break;
-        case ARMII::AddrModeT2_i12:
-          // i12 supports only positive offset so these will be converted to
-          // i8 opcodes. See llvm::rewriteT2FrameIndex.
-          if (hasFP(MF) && AFI->hasStackFrame())
-            Limit = std::min(Limit, (1U << 8) - 1);
-          break;
-        case ARMII::AddrMode6:
-          // Addressing mode 6 (load/store) instructions can't encode an
-          // immediate offset for stack references.
-          return 0;
-        default:
-          break;
-        }
-        break; // At most one FI per instruction
-      }
-    }
-  }
-
-  return Limit;
-}
-
-static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
-                                       const ARMBaseInstrInfo &TII) {
-  unsigned FnSize = 0;
-  for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end();
-       MBBI != E; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-    for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end();
-         I != E; ++I)
-      FnSize += TII.GetInstSizeInBytes(I);
-  }
-  return FnSize;
-}
-
-void
-ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                       RegScavenger *RS) const {
-  // This tells PEI to spill the FP as if it is any other callee-save register
-  // to take advantage the eliminateFrameIndex machinery. This also ensures it
-  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
-  // to combine multiple loads / stores.
-  bool CanEliminateFrame = true;
-  bool CS1Spilled = false;
-  bool LRSpilled = false;
-  unsigned NumGPRSpills = 0;
-  SmallVector<unsigned, 4> UnspilledCS1GPRs;
-  SmallVector<unsigned, 4> UnspilledCS2GPRs;
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
-  // scratch register.
-  // FIXME: It will be better just to find spare register here.
-  if (needsStackRealignment(MF) &&
-      AFI->isThumb2Function())
-    MF.getRegInfo().setPhysRegUsed(ARM::R4);
-
-  // Spill LR if Thumb1 function uses variable length argument lists.
-  if (AFI->isThumb1OnlyFunction() && AFI->getVarArgsRegSaveSize() > 0)
-    MF.getRegInfo().setPhysRegUsed(ARM::LR);
-
-  // Spill the BasePtr if it's used.
-  if (hasBasePointer(MF))
-    MF.getRegInfo().setPhysRegUsed(BasePtr);
-
-  // Don't spill FP if the frame can be eliminated. This is determined
-  // by scanning the callee-save registers to see if any is used.
-  const unsigned *CSRegs = getCalleeSavedRegs();
-  for (unsigned i = 0; CSRegs[i]; ++i) {
-    unsigned Reg = CSRegs[i];
-    bool Spilled = false;
-    if (MF.getRegInfo().isPhysRegUsed(Reg)) {
-      AFI->setCSRegisterIsSpilled(Reg);
-      Spilled = true;
-      CanEliminateFrame = false;
-    } else {
-      // Check alias registers too.
-      for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
-        if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
-          Spilled = true;
-          CanEliminateFrame = false;
-        }
-      }
-    }
-
-    if (!ARM::GPRRegisterClass->contains(Reg))
-      continue;
-
-    if (Spilled) {
-      NumGPRSpills++;
-
-      if (!STI.isTargetDarwin()) {
-        if (Reg == ARM::LR)
-          LRSpilled = true;
-        CS1Spilled = true;
-        continue;
-      }
-
-      // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
-      switch (Reg) {
-      case ARM::LR:
-        LRSpilled = true;
-        // Fallthrough
-      case ARM::R4:
-      case ARM::R5:
-      case ARM::R6:
-      case ARM::R7:
-        CS1Spilled = true;
-        break;
-      default:
-        break;
-      }
-    } else {
-      if (!STI.isTargetDarwin()) {
-        UnspilledCS1GPRs.push_back(Reg);
-        continue;
-      }
-
-      switch (Reg) {
-      case ARM::R4:
-      case ARM::R5:
-      case ARM::R6:
-      case ARM::R7:
-      case ARM::LR:
-        UnspilledCS1GPRs.push_back(Reg);
-        break;
-      default:
-        UnspilledCS2GPRs.push_back(Reg);
-        break;
-      }
-    }
-  }
-
-  bool ForceLRSpill = false;
-  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
-    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
-    // Force LR to be spilled if the Thumb function size is > 2048. This enables
-    // use of BL to implement far jump. If it turns out that it's not needed
-    // then the branch fix up path will undo it.
-    if (FnSize >= (1 << 11)) {
-      CanEliminateFrame = false;
-      ForceLRSpill = true;
-    }
-  }
-
-  // If any of the stack slot references may be out of range of an immediate
-  // offset, make sure a register (or a spill slot) is available for the
-  // register scavenger. Note that if we're indexing off the frame pointer, the
-  // effective stack size is 4 bytes larger since the FP points to the stack
-  // slot of the previous FP. Also, if we have variable sized objects in the
-  // function, stack slot references will often be negative, and some of
-  // our instructions are positive-offset only, so conservatively consider
-  // that case to want a spill slot (or register) as well. Similarly, if
-  // the function adjusts the stack pointer during execution and the
-  // adjustments aren't already part of our stack size estimate, our offset
-  // calculations may be off, so be conservative.
-  // FIXME: We could add logic to be more precise about negative offsets
-  //        and which instructions will need a scratch register for them. Is it
-  //        worth the effort and added fragility?
-  bool BigStack =
-    (RS &&
-     (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
-      estimateRSStackSizeLimit(MF)))
-    || MFI->hasVarSizedObjects()
-    || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
-
-  bool ExtraCSSpill = false;
-  if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) {
-    AFI->setHasStackFrame(true);
-
-    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
-    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
-    if (!LRSpilled && CS1Spilled) {
-      MF.getRegInfo().setPhysRegUsed(ARM::LR);
-      AFI->setCSRegisterIsSpilled(ARM::LR);
-      NumGPRSpills++;
-      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
-                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
-      ForceLRSpill = false;
-      ExtraCSSpill = true;
-    }
-
-    if (hasFP(MF)) {
-      MF.getRegInfo().setPhysRegUsed(FramePtr);
-      NumGPRSpills++;
-    }
-
-    // If stack and double are 8-byte aligned and we are spilling an odd number
-    // of GPRs. Spill one extra callee save GPR so we won't have to pad between
-    // the integer and double callee save areas.
-    unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
-    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
-      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
-        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
-          unsigned Reg = UnspilledCS1GPRs[i];
-          // Don't spill high register if the function is thumb1
-          if (!AFI->isThumb1OnlyFunction() ||
-              isARMLowRegister(Reg) || Reg == ARM::LR) {
-            MF.getRegInfo().setPhysRegUsed(Reg);
-            AFI->setCSRegisterIsSpilled(Reg);
-            if (!isReservedReg(MF, Reg))
-              ExtraCSSpill = true;
-            break;
-          }
-        }
-      } else if (!UnspilledCS2GPRs.empty() &&
-                 !AFI->isThumb1OnlyFunction()) {
-        unsigned Reg = UnspilledCS2GPRs.front();
-        MF.getRegInfo().setPhysRegUsed(Reg);
-        AFI->setCSRegisterIsSpilled(Reg);
-        if (!isReservedReg(MF, Reg))
-          ExtraCSSpill = true;
-      }
-    }
-
-    // Estimate if we might need to scavenge a register at some point in order
-    // to materialize a stack offset. If so, either spill one additional
-    // callee-saved register or reserve a special spill slot to facilitate
-    // register scavenging. Thumb1 needs a spill slot for stack pointer
-    // adjustments also, even when the frame itself is small.
-    if (BigStack && !ExtraCSSpill) {
-      // If any non-reserved CS register isn't spilled, just spill one or two
-      // extra. That should take care of it!
-      unsigned NumExtras = TargetAlign / 4;
-      SmallVector<unsigned, 2> Extras;
-      while (NumExtras && !UnspilledCS1GPRs.empty()) {
-        unsigned Reg = UnspilledCS1GPRs.back();
-        UnspilledCS1GPRs.pop_back();
-        if (!isReservedReg(MF, Reg) &&
-            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
-             Reg == ARM::LR)) {
-          Extras.push_back(Reg);
-          NumExtras--;
-        }
-      }
-      // For non-Thumb1 functions, also check for hi-reg CS registers
-      if (!AFI->isThumb1OnlyFunction()) {
-        while (NumExtras && !UnspilledCS2GPRs.empty()) {
-          unsigned Reg = UnspilledCS2GPRs.back();
-          UnspilledCS2GPRs.pop_back();
-          if (!isReservedReg(MF, Reg)) {
-            Extras.push_back(Reg);
-            NumExtras--;
-          }
-        }
-      }
-      if (Extras.size() && NumExtras == 0) {
-        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
-          MF.getRegInfo().setPhysRegUsed(Extras[i]);
-          AFI->setCSRegisterIsSpilled(Extras[i]);
-        }
-      } else if (!AFI->isThumb1OnlyFunction()) {
-        // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
-        // closest to SP or frame pointer.
-        const TargetRegisterClass *RC = ARM::GPRRegisterClass;
-        RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
-                                                           RC->getAlignment(),
-                                                           false));
-      }
-    }
-  }
-
-  if (ForceLRSpill) {
-    MF.getRegInfo().setPhysRegUsed(ARM::LR);
-    AFI->setCSRegisterIsSpilled(ARM::LR);
-    AFI->setLRIsSpilledForFarJump(true);
-  }
-}
-
 unsigned ARMBaseRegisterInfo::getRARegister() const {
   return ARM::LR;
 }
 
 unsigned
 ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  if (hasFP(MF))
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (TFI->hasFP(MF))
     return FramePtr;
   return ARM::SP;
 }
 
-// Provide a base+offset reference to an FI slot for debug info. It's the
-// same as what we use for resolving the code-gen references for now.
-// FIXME: This can go wrong when references are SP-relative and simple call
-//        frames aren't used.
-int
-ARMBaseRegisterInfo::getFrameIndexReference(const MachineFunction &MF, int FI,
-                                            unsigned &FrameReg) const {
-  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
-}
-
-int
-ARMBaseRegisterInfo::ResolveFrameIndexReference(const MachineFunction &MF,
-                                                int FI,
-                                                unsigned &FrameReg,
-                                                int SPAdj) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
-  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
-  bool isFixed = MFI->isFixedObjectIndex(FI);
-
-  FrameReg = ARM::SP;
-  Offset += SPAdj;
-  if (AFI->isGPRCalleeSavedArea1Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea1Offset();
-  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea2Offset();
-  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
-    return Offset - AFI->getDPRCalleeSavedAreaOffset();
-
-  // When dynamically realigning the stack, use the frame pointer for
-  // parameters, and the stack/base pointer for locals.
-  if (needsStackRealignment(MF)) {
-    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
-    if (isFixed) {
-      FrameReg = getFrameRegister(MF);
-      Offset = FPOffset;
-    } else if (MFI->hasVarSizedObjects()) {
-      assert(hasBasePointer(MF) &&
-             "VLAs and dynamic stack alignment, but missing base pointer!");
-      FrameReg = BasePtr;
-    }
-    return Offset;
-  }
-
-  // If there is a frame pointer, use it when we can.
-  if (hasFP(MF) && AFI->hasStackFrame()) {
-    // Use frame pointer to reference fixed objects. Use it for locals if
-    // there are VLAs (and thus the SP isn't reliable as a base).
-    if (isFixed || (MFI->hasVarSizedObjects() && !hasBasePointer(MF))) {
-      FrameReg = getFrameRegister(MF);
-      return FPOffset;
-    } else if (MFI->hasVarSizedObjects()) {
-      assert(hasBasePointer(MF) && "missing base pointer!");
-      // Use the base register since we have it.
-      FrameReg = BasePtr;
-    } else if (AFI->isThumb2Function()) {
-      // In Thumb2 mode, the negative offset is very limited. Try to avoid
-      // out of range references.
-      if (FPOffset >= -255 && FPOffset < 0) {
-        FrameReg = getFrameRegister(MF);
-        return FPOffset;
-      }
-    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
-      // Otherwise, use SP or FP, whichever is closer to the stack slot.
-      FrameReg = getFrameRegister(MF);
-      return FPOffset;
-    }
-  }
-  // Use the base pointer if we have one.
-  if (hasBasePointer(MF))
-    FrameReg = BasePtr;
-  return Offset;
-}
-
-int
-ARMBaseRegisterInfo::getFrameIndexOffset(const MachineFunction &MF,
-                                         int FI) const {
-  unsigned FrameReg;
-  return getFrameIndexReference(MF, FI, FrameReg);
-}
-
 unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
   llvm_unreachable("What is the exception register");
   return 0;
@@ -1320,7 +816,7 @@ emitLoadConstPool(MachineBasicBlock &MBB,
   BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
     .addReg(DestReg, getDefRegState(true), SubIdx)
     .addConstantPoolIndex(Idx)
-    .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+    .addImm(0).addImm(Pred).addReg(PredReg);
 }
 
 bool ARMBaseRegisterInfo::
@@ -1338,34 +834,6 @@ requiresVirtualBaseRegisters(const MachineFunction &MF) const {
   return EnableLocalStackAlloc;
 }
 
-// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
-// not required, we reserve argument space for call sites in the function
-// immediately on entry to the current function. This eliminates the need for
-// add/sub sp brackets around call sites. Returns true if the call frame is
-// included as part of the stack frame.
-bool ARMBaseRegisterInfo::
-hasReservedCallFrame(const MachineFunction &MF) const {
-  const MachineFrameInfo *FFI = MF.getFrameInfo();
-  unsigned CFSize = FFI->getMaxCallFrameSize();
-  // It's not always a good idea to include the call frame as part of the
-  // stack frame. ARM (especially Thumb) has small immediate offset to
-  // address the stack frame. So a large call frame can cause poor codegen
-  // and may even makes it impossible to scavenge a register.
-  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
-    return false;
-
-  return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-// canSimplifyCallFramePseudos - If there is a reserved call frame, the
-// call frame pseudos can be simplified. Unlike most targets, having a FP
-// is not sufficient here since we still may reference some objects via SP
-// even when FP is available in Thumb2 mode.
-bool ARMBaseRegisterInfo::
-canSimplifyCallFramePseudos(const MachineFunction &MF) const {
-  return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
-}
-
 static void
 emitSPUpdate(bool isARM,
              MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
@@ -1384,7 +852,8 @@ emitSPUpdate(bool isARM,
 void ARMBaseRegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  if (!hasReservedCallFrame(MF)) {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
     // ADJCALLSTACKUP   -> add, sp, sp, amount
@@ -1395,7 +864,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       // We need to keep the stack aligned properly.  To do this, we round the
       // amount of space needed for the outgoing arguments up to the next
       // alignment boundary.
-      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      unsigned Align = TFI->getStackAlignment();
       Amount = (Amount+Align-1)/Align*Align;
 
       ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -1433,8 +902,7 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
   switch (AddrMode) {
   case ARMII::AddrModeT2_i8:
   case ARMII::AddrModeT2_i12:
-    // i8 supports only negative, and i12 supports only positive, so
-    // based on Offset sign, consider the appropriate instruction
+  case ARMII::AddrMode_i12:
     InstrOffs = MI->getOperand(Idx+1).getImm();
     Scale = 1;
     break;
@@ -1496,8 +964,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   // return false for everything else.
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
-  case ARM::LDR: case ARM::LDRH: case ARM::LDRB:
-  case ARM::STR: case ARM::STRH: case ARM::STRB:
+  case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12:
+  case ARM::STRi12: case ARM::STRH: case ARM::STRBi12:
   case ARM::t2LDRi12: case ARM::t2LDRi8:
   case ARM::t2STRi12: case ARM::t2STRi8:
   case ARM::VLDRS: case ARM::VLDRD:
@@ -1516,6 +984,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   // Note that the incoming offset is based on the SP value at function entry,
   // so it'll be negative.
   MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
@@ -1542,8 +1011,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   // The FP is only available if there is no dynamic realignment. We
   // don't know for sure yet whether we'll need that, so we guess based
   // on whether there are any local variables that would trigger it.
-  unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
-  if (hasFP(MF) &&
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) &&
       !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
     if (isFrameOffsetLegal(MI, FPOffset))
       return false;
@@ -1560,19 +1029,25 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   return true;
 }
 
-/// materializeFrameBaseRegister - Insert defining instruction(s) for
-/// BaseReg to be a pointer to FrameIdx before insertion point I.
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
 void ARMBaseRegisterInfo::
-materializeFrameBaseRegister(MachineBasicBlock::iterator I, unsigned BaseReg,
-                             int FrameIdx, int64_t Offset) const {
-  ARMFunctionInfo *AFI =
-    I->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
   unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
     (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri);
 
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
   MachineInstrBuilder MIB =
-    BuildMI(*I->getParent(), I, I->getDebugLoc(), TII.get(ADDriOpc), BaseReg)
+    BuildMI(*MBB, Ins, DL, TII.get(ADDriOpc), BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
+
   if (!AFI->isThumb1OnlyFunction())
     AddDefaultCC(AddDefaultPred(MIB));
 }
@@ -1640,6 +1115,7 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
     NumBits = 8;
     Scale = 4;
     break;
+  case ARMII::AddrMode_i12:
   case ARMII::AddrMode2:
     NumBits = 12;
     break;
@@ -1679,6 +1155,8 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const ARMFrameLowering *TFI =
+    static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   assert(!AFI->isThumb1OnlyFunction() &&
          "This eliminateFrameIndex does not support Thumb1!");
@@ -1691,7 +1169,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int FrameIndex = MI.getOperand(i).getIndex();
   unsigned FrameReg;
 
-  int Offset = ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+  int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
 
   // Special handling of dbg_value instructions.
   if (MI.isDebugValue()) {
@@ -1737,339 +1215,13 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
       emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
                              Offset, Pred, PredReg, TII);
     }
+    // Update the original instruction to use the scratch register.
     MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+    if (MI.getOpcode() == ARM::t2ADDrSPi)
+      MI.setDesc(TII.get(ARM::t2ADDri));
+    else if (MI.getOpcode() == ARM::t2SUBrSPi)
+      MI.setDesc(TII.get(ARM::t2SUBri));
   }
 }
 
-/// Move iterator past the next bunch of callee save load / store ops for
-/// the particular spill area (1: integer area 1, 2: integer area 2,
-/// 3: fp area, 0: don't care).
-static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator &MBBI,
-                                   int Opc1, int Opc2, unsigned Area,
-                                   const ARMSubtarget &STI) {
-  while (MBBI != MBB.end() &&
-         ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) &&
-         MBBI->getOperand(1).isFI()) {
-    if (Area != 0) {
-      bool Done = false;
-      unsigned Category = 0;
-      switch (MBBI->getOperand(0).getReg()) {
-      case ARM::R4:  case ARM::R5:  case ARM::R6: case ARM::R7:
-      case ARM::LR:
-        Category = 1;
-        break;
-      case ARM::R8:  case ARM::R9:  case ARM::R10: case ARM::R11:
-        Category = STI.isTargetDarwin() ? 2 : 1;
-        break;
-      case ARM::D8:  case ARM::D9:  case ARM::D10: case ARM::D11:
-      case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
-        Category = 3;
-        break;
-      default:
-        Done = true;
-        break;
-      }
-      if (Done || Category != Area)
-        break;
-    }
-
-    ++MBBI;
-  }
-}
-
-void ARMBaseRegisterInfo::
-emitPrologue(MachineFunction &MF) const {
-  MachineBasicBlock &MBB = MF.front();
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  MachineFrameInfo  *MFI = MF.getFrameInfo();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  assert(!AFI->isThumb1OnlyFunction() &&
-         "This emitPrologue does not support Thumb1!");
-  bool isARM = !AFI->isThumbFunction();
-  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
-  unsigned NumBytes = MFI->getStackSize();
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
-
-  // Determine the sizes of each callee-save spill areas and record which frame
-  // belongs to which callee-save spill areas.
-  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
-  int FramePtrSpillFI = 0;
-
-  // Allocate the vararg register save area. This is not counted in NumBytes.
-  if (VARegSaveSize)
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
-
-  if (!AFI->hasStackFrame()) {
-    if (NumBytes != 0)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
-    return;
-  }
-
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    int FI = CSI[i].getFrameIdx();
-    switch (Reg) {
-    case ARM::R4:
-    case ARM::R5:
-    case ARM::R6:
-    case ARM::R7:
-    case ARM::LR:
-      if (Reg == FramePtr)
-        FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
-      GPRCS1Size += 4;
-      break;
-    case ARM::R8:
-    case ARM::R9:
-    case ARM::R10:
-    case ARM::R11:
-      if (Reg == FramePtr)
-        FramePtrSpillFI = FI;
-      if (STI.isTargetDarwin()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
-        GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
-        GPRCS1Size += 4;
-      }
-      break;
-    default:
-      AFI->addDPRCalleeSavedAreaFrame(FI);
-      DPRCSSize += 8;
-    }
-  }
-
-  // Build the new SUBri to adjust SP for integer callee-save spill area 1.
-  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size);
-  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI);
-
-  // Set FP to point to the stack slot that contains the previous FP.
-  // For Darwin, FP is R7, which has now been stored in spill area 1.
-  // Otherwise, if this is not Darwin, all the callee-saved registers go
-  // into spill area 1, including the FP in R11.  In either case, it is
-  // now safe to emit this assignment.
-  bool HasFP = hasFP(MF);
-  if (HasFP) {
-    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
-    MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0);
-    AddDefaultCC(AddDefaultPred(MIB));
-  }
-
-  // Build the new SUBri to adjust SP for integer callee-save spill area 2.
-  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size);
-
-  // Build the new SUBri to adjust SP for FP callee-save spill area.
-  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI);
-  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize);
-
-  // Determine starting offsets of spill areas.
-  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
-  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
-  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
-  if (HasFP)
-    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
-                                NumBytes);
-  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
-  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
-  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
-  movePastCSLoadStoreOps(MBB, MBBI, ARM::VSTRD, 0, 3, STI);
-  NumBytes = DPRCSOffset;
-  if (NumBytes) {
-    // Adjust SP after all the callee-save spills.
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
-    if (HasFP)
-      AFI->setShouldRestoreSPFromFP(true);
-  }
-
-  if (STI.isTargetELF() && hasFP(MF)) {
-    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
-                             AFI->getFramePtrSpillOffset());
-    AFI->setShouldRestoreSPFromFP(true);
-  }
-
-  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
-  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
-  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
-
-  // If we need dynamic stack realignment, do it here. Be paranoid and make
-  // sure if we also have VLAs, we have a base pointer for frame access.
-  if (needsStackRealignment(MF)) {
-    unsigned MaxAlign = MFI->getMaxAlignment();
-    assert (!AFI->isThumb1OnlyFunction());
-    if (!AFI->isThumbFunction()) {
-      // Emit bic sp, sp, MaxAlign
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::BICri), ARM::SP)
-                                  .addReg(ARM::SP, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
-    } else {
-      // We cannot use sp as source/dest register here, thus we're emitting the
-      // following sequence:
-      // mov r4, sp
-      // bic r4, r4, MaxAlign
-      // mov sp, r4
-      // FIXME: It will be better just to find spare register here.
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4)
-        .addReg(ARM::SP, RegState::Kill);
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::t2BICri), ARM::R4)
-                                  .addReg(ARM::R4, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill);
-    }
-
-    AFI->setShouldRestoreSPFromFP(true);
-  }
-
-  // If we need a base pointer, set it up here. It's whatever the value
-  // of the stack pointer is at this point. Any variable size objects
-  // will be allocated after this, so we can still use the base pointer
-  // to reference locals.
-  if (hasBasePointer(MF)) {
-    if (isARM)
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), BasePtr)
-        .addReg(ARM::SP)
-        .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
-    else
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr)
-        .addReg(ARM::SP);
-  }
-
-  // If the frame has variable sized objects then the epilogue must restore
-  // the sp from fp.
-  if (!AFI->shouldRestoreSPFromFP() && MFI->hasVarSizedObjects())
-    AFI->setShouldRestoreSPFromFP(true);
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
-static bool isCSRestore(MachineInstr *MI,
-                        const ARMBaseInstrInfo &TII,
-                        const unsigned *CSRegs) {
-  return ((MI->getOpcode() == (int)ARM::VLDRD ||
-           MI->getOpcode() == (int)ARM::LDR ||
-           MI->getOpcode() == (int)ARM::t2LDRi12) &&
-          MI->getOperand(1).isFI() &&
-          isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
-}
-
-void ARMBaseRegisterInfo::
-emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = prior(MBB.end());
-  assert(MBBI->getDesc().isReturn() &&
-         "Can only insert epilog into returning blocks");
-  unsigned RetOpcode = MBBI->getOpcode();
-  DebugLoc dl = MBBI->getDebugLoc();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  assert(!AFI->isThumb1OnlyFunction() &&
-         "This emitEpilogue does not support Thumb1!");
-  bool isARM = !AFI->isThumbFunction();
-
-  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
-  int NumBytes = (int)MFI->getStackSize();
-
-  if (!AFI->hasStackFrame()) {
-    if (NumBytes != 0)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
-  } else {
-    // Unwind MBBI to point to first LDR / VLDRD.
-    const unsigned *CSRegs = getCalleeSavedRegs();
-    if (MBBI != MBB.begin()) {
-      do
-        --MBBI;
-      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
-      if (!isCSRestore(MBBI, TII, CSRegs))
-        ++MBBI;
-    }
-
-    // Move SP to start of FP callee save spill area.
-    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
-                 AFI->getGPRCalleeSavedArea2Size() +
-                 AFI->getDPRCalleeSavedAreaSize());
-
-    // Reset SP based on frame pointer only if the stack frame extends beyond
-    // frame pointer stack slot or target is ELF and the function has FP.
-    if (AFI->shouldRestoreSPFromFP()) {
-      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
-      if (NumBytes) {
-        if (isARM)
-          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
-                                  ARMCC::AL, 0, TII);
-        else
-          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
-                                 ARMCC::AL, 0, TII);
-      } else {
-        // Thumb2 or ARM.
-        if (isARM)
-          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
-            .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
-        else
-          BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
-            .addReg(FramePtr);
-      }
-    } else if (NumBytes)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
-
-    // Move SP to start of integer callee save spill area 2.
-    movePastCSLoadStoreOps(MBB, MBBI, ARM::VLDRD, 0, 3, STI);
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize());
-
-    // Move SP to start of integer callee save spill area 1.
-    movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI);
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size());
-
-    // Move SP to SP upon entry to the function.
-    movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI);
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size());
-  }
-
-  if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND ||
-      RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) {
-    // Tail call return: adjust the stack pointer and jump to callee.
-    MBBI = prior(MBB.end());
-    MachineOperand &JumpTarget = MBBI->getOperand(0);
-
-    // Jump to label or value in register.
-    if (RetOpcode == ARM::TCRETURNdi) {
-      BuildMI(MBB, MBBI, dl,
-            TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)).
-        addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
-                         JumpTarget.getTargetFlags());
-    } else if (RetOpcode == ARM::TCRETURNdiND) {
-      BuildMI(MBB, MBBI, dl,
-            TII.get(STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND)).
-        addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
-                         JumpTarget.getTargetFlags());
-    } else if (RetOpcode == ARM::TCRETURNri) {
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)).
-        addReg(JumpTarget.getReg(), RegState::Kill);
-    } else if (RetOpcode == ARM::TCRETURNriND) {
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)).
-        addReg(JumpTarget.getReg(), RegState::Kill);
-    }
-
-    MachineInstr *NewMI = prior(MBBI);
-    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
-      NewMI->addOperand(MBBI->getOperand(i));
-
-    // Delete the pseudo instruction TCRETURN.
-    MBB.erase(MBBI);
-  }
-
-  if (VARegSaveSize)
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
-}
-
 #include "ARMGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index fa2eb6c..ba6bd2b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -44,6 +44,45 @@ static inline bool isARMLowRegister(unsigned Reg) {
   }
 }
 
+/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
+/// or a stack/pc register that we should push/pop.
+static inline bool isARMArea1Register(unsigned Reg, bool isDarwin) {
+  using namespace ARM;
+  switch (Reg) {
+    case R0:  case R1:  case R2:  case R3:
+    case R4:  case R5:  case R6:  case R7:
+    case LR:  case SP:  case PC:
+      return true;
+    case R8:  case R9:  case R10: case R11:
+      // For darwin we want r7 and lr to be next to each other.
+      return !isDarwin;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea2Register(unsigned Reg, bool isDarwin) {
+  using namespace ARM;
+  switch (Reg) {
+    case R8: case R9: case R10: case R11:
+      // Darwin has this second area.
+      return isDarwin;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea3Register(unsigned Reg, bool isDarwin) {
+  using namespace ARM;
+  switch (Reg) {
+    case D15: case D14: case D13: case D12:
+    case D11: case D10: case D9:  case D8:
+      return true;
+    default:
+      return false;
+  }
+}
+
 class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
 protected:
   const ARMBaseInstrInfo &TII;
@@ -65,12 +104,6 @@ protected:
   unsigned getOpcode(int Op) const;
 
 public:
-  /// getRegisterNumbering - Given the enum value for some register, e.g.
-  /// ARM::LR, return the number that it corresponds to (e.g. 14). It
-  /// also returns true in isSPVFP if the register is a single precision
-  /// VFP register.
-  static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0);
-
   /// Code Generation virtual methods...
   const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
 
@@ -106,14 +139,13 @@ public:
   void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
                           MachineFunction &MF) const;
 
-  bool hasFP(const MachineFunction &MF) const;
   bool hasBasePointer(const MachineFunction &MF) const;
 
   bool canRealignStack(const MachineFunction &MF) const;
   bool needsStackRealignment(const MachineFunction &MF) const;
   int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const;
   bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const;
-  void materializeFrameBaseRegister(MachineBasicBlock::iterator I,
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                     unsigned BaseReg, int FrameIdx,
                                     int64_t Offset) const;
   void resolveFrameIndex(MachineBasicBlock::iterator I,
@@ -122,17 +154,10 @@ public:
 
   bool cannotEliminateFrame(const MachineFunction &MF) const;
 
-  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS = NULL) const;
-
   // Debug information queries.
   unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
-  int getFrameIndexReference(const MachineFunction &MF, int FI,
-                             unsigned &FrameReg) const;
-  int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
-                                 unsigned &FrameReg, int SPAdj) const;
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+  unsigned getBaseRegister() const { return BasePtr; }
 
   // Exception handling queries.
   unsigned getEHExceptionRegister() const;
@@ -162,9 +187,6 @@ public:
 
   virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const;
 
-  virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
-  virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const;
-
   virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                            MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator I) const;
@@ -172,12 +194,7 @@ public:
   virtual void eliminateFrameIndex(MachineBasicBlock::iterator II,
                                    int SPAdj, RegScavenger *RS = NULL) const;
 
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
 private:
-  unsigned estimateRSStackSizeLimit(MachineFunction &MF) const;
-
   unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
 
   unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
diff --git a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h
index 3b38375..69eddf0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file contains enumerations and support routines for ARM build attributes
-// as defined in ARM ABI addenda document (ABI release 2.07).
+// as defined in ARM ABI addenda document (ABI release 2.08).
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,7 +16,14 @@
 #define __TARGET_ARMBUILDATTRS_H__
 
 namespace ARMBuildAttrs {
-  enum {
+  enum SpecialAttr {
+    // This is for the .cpu asm attr. It translates into one or more
+    // AttrType (below) entries in the .ARM.attributes section in the ELF.
+    SEL_CPU 
+  };
+
+  enum AttrType {
+    // Rest correspond to ELF/.ARM.attributes
     File                      = 1,
     Section                   = 2,
     Symbol                    = 3,
@@ -52,12 +59,72 @@ namespace ARMBuildAttrs {
     CPU_unaligned_access      = 34,
     VFP_HP_extension          = 36,
     ABI_FP_16bit_format       = 38,
+    MPextension_use           = 42, // was 70, 2.08 ABI
+    DIV_use                   = 44,
     nodefaults                = 64,
     also_compatible_with      = 65,
     T2EE_use                  = 66,
     conformance               = 67,
     Virtualization_use        = 68,
-    MPextension_use           = 70
+    MPextension_use_old       = 70
+  };
+
+  // Magic numbers for .ARM.attributes
+  enum AttrMagic {
+    Format_Version  = 0x41
+  };
+
+  // Legal Values for CPU_arch, (=6), uleb128
+  enum CPUArch {
+    Pre_v4       = 0,
+    v4       = 1,   // e.g. SA110
+    v4T      = 2,   // e.g. ARM7TDMI
+    v5T      = 3,   // e.g. ARM9TDMI
+    v5TE     = 4,   // e.g. ARM946E_S
+    v5TEJ    = 5,   // e.g. ARM926EJ_S
+    v6       = 6,   // e.g. ARM1136J_S
+    v6KZ     = 7,   // e.g. ARM1176JZ_S
+    v6T2     = 8,   // e.g. ARM1156T2F_S
+    v6K      = 9,   // e.g. ARM1136J_S
+    v7       = 10,  // e.g. Cortex A8, Cortex M3
+    v6_M     = 11,  // e.g. Cortex M1
+    v6S_M    = 12,  // v6_M with the System extensions
+    v7E_M    = 13   // v7_M with DSP extensions
+  };
+
+  enum CPUArchProfile { // (=7), uleb128 
+    Not_Applicable = 0, // pre v7, or cross-profile code
+    ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
+    RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
+    MicroControllerProfile = (0x4D), // 'M' (e.g. for Cortex M3)
+    SystemProfile = (0x53) // 'S' Application or real-time profile
+  };
+
+  // The following have a lot of common use cases
+  enum { 
+    //ARMISAUse (=8), uleb128  and THUMBISAUse (=9), uleb128
+    Not_Allowed = 0,
+    Allowed = 1,
+
+    // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
+    AllowFPv2  = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
+    AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
+    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 
+    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) 
+    AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
+
+    // Tag_WMMX_arch, (=11), uleb128
+    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
+    
+    // Tag_WMMX_arch, (=11), uleb128
+    AllowWMMXv1 = 2,  // The user permitted this entity to use WMMX v2
+
+    // Tag_ABI_FP_denormal, (=20), uleb128 
+    PreserveFPSign = 2, // sign when flushed-to-zero is preserved
+
+    // Tag_ABI_FP_number_model, (=23), uleb128
+    AllowRTABI = 2,  // numbers, infinities, and one quiet NaN (see [RTABI])
+    AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings
   };
 }
 
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
new file mode 100644
index 0000000..ff7db1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -0,0 +1,160 @@
+//===-- ARMCallingConv.h - ARM Custom Calling Convention Routines ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMCALLINGCONV_H
+#define ARMCALLINGCONV_H
+
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "ARM.h"
+
+namespace llvm {
+
+// APCS f64 is in register pairs, possibly split to stack
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // For the 2nd half of a v2f64, do not fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get the second register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+  static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
+  if (Reg == 0) {
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  unsigned T = State.AllocateReg(LoRegList[i]);
+  (void)T;
+  assert(T == LoRegList[i] && "Could not allocate register");
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+  if (Reg == 0)
+    return false; // we didn't handle it
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  return true;  // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
index 293e32a..426ba13 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -53,6 +53,34 @@ def RetCC_ARM_APCS : CallingConv<[
 ]>;
 
 //===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+def FastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetFastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_APCS>
+]>;
+
+
+//===----------------------------------------------------------------------===//
 // ARM AAPCS (EABI) Calling Convention, common parts
 //===----------------------------------------------------------------------===//
 
@@ -105,6 +133,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
 
 //===----------------------------------------------------------------------===//
 // ARM AAPCS-VFP (EABI) Calling Convention
+// Also used for FastCC (when VFP2 or later is available)
 //===----------------------------------------------------------------------===//
 
 def CC_ARM_AAPCS_VFP : CallingConv<[
diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
index b1a702f..9bbf6a0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -74,7 +74,7 @@ namespace {
     /// getBinaryCodeForInstr - This function, generated by the
     /// CodeEmitterGenerator using TableGen, produces the binary encoding for
     /// machine instructions.
-    unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+    unsigned getBinaryCodeForInstr(const MachineInstr &MI) const;
 
     bool runOnMachineFunction(MachineFunction &MF);
 
@@ -101,7 +101,6 @@ namespace {
                                     unsigned OpIdx);
 
     unsigned getMachineSoImmOpValue(unsigned SoImm);
-
     unsigned getAddrModeSBit(const MachineInstr &MI,
                              const TargetInstrDesc &TID) const;
 
@@ -140,8 +139,6 @@ namespace {
 
     void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
 
-    void emitMiscInstruction(const MachineInstr &MI);
-
     void emitNEONLaneInstruction(const MachineInstr &MI);
     void emitNEONDupInstruction(const MachineInstr &MI);
     void emitNEON1RegModImmInstruction(const MachineInstr &MI);
@@ -150,20 +147,176 @@ namespace {
 
     /// getMachineOpValue - Return binary encoding of operand. If the machine
     /// operand requires relocation, record the relocation and return zero.
-    unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO);
-    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) {
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const {
       return getMachineOpValue(MI, MI.getOperand(OpIdx));
     }
 
+    // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the
+    //  TableGen'erated getBinaryCodeForInstr() function to encode any
+    //  operand values, instead querying getMachineOpValue() directly for
+    //  each operand it needs to encode. Thus, any of the new encoder
+    //  helper functions can simply return 0 as the values the return
+    //  are already handled elsewhere. They are placeholders to allow this
+    //  encoder to continue to function until the MC encoder is sufficiently
+    //  far along that this one can be eliminated entirely.
+    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val) 
+      const { return 0; }
+    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val) 
+      const { return 0; }
+    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val) 
+      const { return 0; }
+    unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
+      const { return 0; }
+    unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI,
+      unsigned Op) const { return 0; }
+    unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getRotImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getImmMinusOneOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
+                                            unsigned Op) const { return 0; }
+    unsigned getMsbOpValue(const MachineInstr &MI,
+                           unsigned Op) const { return 0; }
+    uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const {return 0; }
+    uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0; }
+
+    unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
+      const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = getARMRegisterNumbering(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+      uint32_t Binary;
+      Binary = Imm12 & 0xfff;
+      if (Imm12 >= 0)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+
+    unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const {
+      return 0;
+    }
+
+    uint32_t getAddrMode2OpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeSOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = getARMRegisterNumbering(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+
+      // Special value for #-0
+      if (Imm12 == INT32_MIN)
+        Imm12 = 0;
+
+      // Immediate is always encoded as positive. The 'U' bit controls add vs
+      // sub.
+      bool isAdd = true;
+      if (Imm12 < 0) {
+        Imm12 = -Imm12;
+        isAdd = false;
+      }
+
+      uint32_t Binary = Imm12 & 0xfff;
+      if (isAdd)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+    unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
     /// getMovi32Value - Return binary encoding of operand for movw/movt. If the
     /// machine operand requires relocation, record the relocation and return
     /// zero.
     unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO,
                             unsigned Reloc);
-    unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx,
-                            unsigned Reloc) {
-      return getMovi32Value(MI, MI.getOperand(OpIdx), Reloc);
-    }
 
     /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
     ///
@@ -173,12 +326,12 @@ namespace {
     /// fixed up by the relocation stage.
     void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
                            bool MayNeedFarStub,  bool Indirect,
-                           intptr_t ACPV = 0);
-    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
-    void emitConstPoolAddress(unsigned CPI, unsigned Reloc);
-    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc);
+                           intptr_t ACPV = 0) const;
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
     void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
-                               intptr_t JTBase = 0);
+                               intptr_t JTBase = 0) const;
   };
 }
 
@@ -266,9 +419,9 @@ unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI,
 /// getMachineOpValue - Return binary encoding of operand. If the machine
 /// operand requires relocation, record the relocation and return zero.
 unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
-                                           const MachineOperand &MO) {
+                                           const MachineOperand &MO) const {
   if (MO.isReg())
-    return ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+    return getARMRegisterNumbering(MO.getReg());
   else if (MO.isImm())
     return static_cast<unsigned>(MO.getImm());
   else if (MO.isGlobal())
@@ -285,12 +438,8 @@ unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
     emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
   else if (MO.isMBB())
     emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
-  else {
-#ifndef NDEBUG
-    errs() << MO;
-#endif
-    llvm_unreachable(0);
-  }
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
   return 0;
 }
 
@@ -298,7 +447,7 @@ unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
 ///
 void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
                                        bool MayNeedFarStub, bool Indirect,
-                                       intptr_t ACPV) {
+                                       intptr_t ACPV) const {
   MachineRelocation MR = Indirect
     ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
                                            const_cast<GlobalValue *>(GV),
@@ -312,7 +461,8 @@ void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
 /// emitExternalSymbolAddress - Arrange for the address of an external symbol to
 /// be emitted to the current location in the function, and allow it to be PC
 /// relative.
-void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
+void ARMCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
   MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
                                                  Reloc, ES));
 }
@@ -320,7 +470,7 @@ void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
 /// emitConstPoolAddress - Arrange for the address of an constant pool
 /// to be emitted to the current location in the function, and allow it to be PC
 /// relative.
-void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) {
+void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
   // Tell JIT emitter we'll resolve the address.
   MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
                                                     Reloc, CPI, 0, true));
@@ -329,14 +479,16 @@ void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) {
 /// emitJumpTableAddress - Arrange for the address of a jump table to
 /// be emitted to the current location in the function, and allow it to be PC
 /// relative.
-void ARMCodeEmitter::emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) {
+void ARMCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
   MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
                                                     Reloc, JTIndex, 0, true));
 }
 
 /// emitMachineBasicBlock - Emit the specified address basic block.
 void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
-                                           unsigned Reloc, intptr_t JTBase) {
+                                           unsigned Reloc,
+                                           intptr_t JTBase) const {
   MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
                                              Reloc, BB, JTBase));
 }
@@ -364,6 +516,14 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
     llvm_unreachable("Unhandled instruction encoding format!");
     break;
   }
+  case ARMII::MiscFrm:
+    if (MI.getOpcode() == ARM::LEApcrelJT) {
+      // Materialize jumptable address.
+      emitLEApcrelJTInstruction(MI);
+      break;
+    }
+    llvm_unreachable("Unhandled instruction encoding!");
+    break;
   case ARMII::Pseudo:
     emitPseudoInstruction(MI);
     break;
@@ -418,9 +578,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
   case ARMII::VFPLdStMulFrm:
     emitVFPLoadStoreMultipleInstruction(MI);
     break;
-  case ARMII::VFPMiscFrm:
-    emitMiscInstruction(MI);
-    break;
+
   // NEON instructions.
   case ARMII::NGetLnFrm:
   case ARMII::NSetLnFrm:
@@ -488,7 +646,7 @@ void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
       emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
       emitWordLE(0);
     } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
-      uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
+      uint32_t Val = uint32_t(*CI->getValue().getRawData());
       emitWordLE(Val);
     } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
       if (CFP->getType()->isFloatTy())
@@ -588,7 +746,7 @@ void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
   const TargetInstrDesc &TID = MI.getDesc();
 
   // Emit the 'add' instruction.
-  unsigned Binary = 0x4 << 21;  // add: Insts{24-31} = 0b0100
+  unsigned Binary = 0x4 << 21;  // add: Insts{24-21} = 0b0100
 
   // Set the conditional execution predicate
   Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -600,7 +758,7 @@ void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
   Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
 
   // Encode Rn which is PC.
-  Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+  Binary |= getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
 
   // Encode the displacement.
   Binary |= 1 << ARMII::I_BitShift;
@@ -628,7 +786,7 @@ void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) {
   // Encode the shift operation.
   switch (Opcode) {
   default: break;
-  case ARM::MOVrx:
+  case ARM::RRX:
     // rrx
     Binary |= 0x6 << 4;
     break;
@@ -659,10 +817,10 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
   switch (Opcode) {
   default:
     llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
-  case ARM::BX:
-  case ARM::BMOVPCRX:
-  case ARM::BXr9:
-  case ARM::BMOVPCRXr9: {
+  case ARM::BX_CALL:
+  case ARM::BMOVPCRX_CALL:
+  case ARM::BXr9_CALL:
+  case ARM::BMOVPCRXr9_CALL: {
     // First emit mov lr, pc
     unsigned Binary = 0x01a0e00f;
     Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -720,18 +878,18 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
   }
 
   case ARM::MOVi32imm:
-    emitMOVi32immInstruction(MI);
-    break;
-
-  case ARM::MOVi2pieces:
     // Two instructions to materialize a constant.
-    emitMOVi2piecesInstruction(MI);
+    if (Subtarget->hasV6T2Ops())
+      emitMOVi32immInstruction(MI);
+    else
+      emitMOVi2piecesInstruction(MI);
     break;
+
   case ARM::LEApcrelJT:
     // Materialize jumptable address.
     emitLEApcrelJTInstruction(MI);
     break;
-  case ARM::MOVrx:
+  case ARM::RRX:
   case ARM::MOVsrl_flag:
   case ARM::MOVsra_flag:
     emitPseudoMoveInstruction(MI);
@@ -789,8 +947,7 @@ unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
   if (Rs) {
     // Encode Rs bit[11:8].
     assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
-    return Binary |
-      (ARMRegisterInfo::getRegisterNumbering(Rs) << ARMII::RegRsShift);
+    return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
   }
 
   // Encode shift_imm bit[11:7].
@@ -841,8 +998,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
     Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
   else if (ImplicitRd)
     // Special handling for implicit use (e.g. PC).
-    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
-               << ARMII::RegRdShift);
+    Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift);
 
   if (TID.Opcode == ARM::MOVi16) {
       // Get immediate from MI.
@@ -892,8 +1048,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
   if (!isUnary) {
     if (ImplicitRn)
       // Special handling for implicit use (e.g. PC).
-      Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
-                 << ARMII::RegRnShift);
+      Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
     else {
       Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
       ++OpIdx;
@@ -910,7 +1065,7 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
 
   if (MO.isReg()) {
     // Encode register Rm.
-    emitWordLE(Binary | ARMRegisterInfo::getRegisterNumbering(MO.getReg()));
+    emitWordLE(Binary | getARMRegisterNumbering(MO.getReg()));
     return;
   }
 
@@ -930,6 +1085,13 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
   // Part of binary is determined by TableGn.
   unsigned Binary = getBinaryCodeForInstr(MI);
 
+  // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done.
+  if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp ||
+      MI.getOpcode() == ARM::STRi12) {
+    emitWordLE(Binary);
+    return;
+  }
+
   // Set the conditional execution predicate
   Binary |= II->getPredicate(&MI) << ARMII::CondShift;
 
@@ -946,16 +1108,14 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
   // Set first operand
   if (ImplicitRd)
     // Special handling for implicit use (e.g. PC).
-    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
-               << ARMII::RegRdShift);
+    Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift);
   else
     Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
 
   // Set second operand
   if (ImplicitRn)
     // Special handling for implicit use (e.g. PC).
-    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
-               << ARMII::RegRnShift);
+    Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
   else
     Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
 
@@ -978,11 +1138,11 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
     return;
   }
 
-  // Set bit I(25), because this is not in immediate enconding.
+  // Set bit I(25), because this is not in immediate encoding.
   Binary |= 1 << ARMII::I_BitShift;
   assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
   // Set bit[3:0] to the corresponding Rm register
-  Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+  Binary |= getARMRegisterNumbering(MO2.getReg());
 
   // If this instr is in scaled register offset/index instruction, set
   // shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
@@ -1026,8 +1186,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
   // Set second operand
   if (ImplicitRn)
     // Special handling for implicit use (e.g. PC).
-    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
-               << ARMII::RegRnShift);
+    Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift);
   else
     Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
 
@@ -1046,7 +1205,7 @@ void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
   // If this instr is in register offset/index encoding, set bit[3:0]
   // to the corresponding Rm register.
   if (MO2.getReg()) {
-    Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+    Binary |= getARMRegisterNumbering(MO2.getReg());
     emitWordLE(Binary);
     return;
   }
@@ -1100,8 +1259,8 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
   Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
 
   // Set addressing mode by modifying bits U(23) and P(24)
-  const MachineOperand &MO = MI.getOperand(OpIdx++);
-  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm()));
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
 
   // Set bit W(21)
   if (IsUpdating)
@@ -1112,7 +1271,7 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
     const MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg() || MO.isImplicit())
       break;
-    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+    unsigned RegNum = getARMRegisterNumbering(MO.getReg());
     assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
            RegNum < 16);
     Binary |= 0x1 << RegNum;
@@ -1349,7 +1508,7 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
 
   if (TID.Opcode == ARM::BX_RET || TID.Opcode == ARM::MOVPCLR)
     // The return register is LR.
-    Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR);
+    Binary |= getARMRegisterNumbering(ARM::LR);
   else
     // otherwise, set the return register
     Binary |= getMachineOpValue(MI, 0);
@@ -1360,8 +1519,8 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
 static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegD = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  bool isSPVFP = false;
-  RegD = ARMRegisterInfo::getRegisterNumbering(RegD, &isSPVFP);
+  bool isSPVFP = ARM::SPRRegisterClass->contains(RegD);
+  RegD = getARMRegisterNumbering(RegD);
   if (!isSPVFP)
     Binary |=   RegD               << ARMII::RegRdShift;
   else {
@@ -1374,8 +1533,8 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
 static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegN = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  bool isSPVFP = false;
-  RegN = ARMRegisterInfo::getRegisterNumbering(RegN, &isSPVFP);
+  bool isSPVFP = ARM::SPRRegisterClass->contains(RegN);
+  RegN = getARMRegisterNumbering(RegN);
   if (!isSPVFP)
     Binary |=   RegN               << ARMII::RegRnShift;
   else {
@@ -1388,8 +1547,8 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
 static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegM = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  bool isSPVFP = false;
-  RegM = ARMRegisterInfo::getRegisterNumbering(RegM, &isSPVFP);
+  bool isSPVFP = ARM::SPRRegisterClass->contains(RegM);
+  RegM = getARMRegisterNumbering(RegM);
   if (!isSPVFP)
     Binary |=   RegM;
   else {
@@ -1548,8 +1707,8 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
   Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
 
   // Set addressing mode by modifying bits U(23) and P(24)
-  const MachineOperand &MO = MI.getOperand(OpIdx++);
-  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm()));
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
 
   // Set bit W(21)
   if (IsUpdating)
@@ -1576,63 +1735,10 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
   emitWordLE(Binary);
 }
 
-void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) {
-  unsigned Opcode = MI.getDesc().Opcode;
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  switch(Opcode) {
-  default:
-    llvm_unreachable("ARMCodeEmitter::emitMiscInstruction");
-
-  case ARM::FMSTAT:
-    // No further encoding needed.
-    break;
-
-  case ARM::VMRS:
-  case ARM::VMSR: {
-    const MachineOperand &MO0 = MI.getOperand(0);
-    // Encode Rt.
-    Binary |= ARMRegisterInfo::getRegisterNumbering(MO0.getReg())
-                << ARMII::RegRdShift;
-    break;
-  }
-
-  case ARM::FCONSTD:
-  case ARM::FCONSTS: {
-    // Encode Dd / Sd.
-    Binary |= encodeVFPRd(MI, 0);
-
-    // Encode imm., Table A7-18 VFP modified immediate constants
-    const MachineOperand &MO1 = MI.getOperand(1);
-    unsigned Imm = static_cast<unsigned>(MO1.getFPImm()->getValueAPF()
-                      .bitcastToAPInt().getHiBits(32).getLimitedValue());
-    unsigned ModifiedImm;
-
-    if(Opcode == ARM::FCONSTS)
-      ModifiedImm = (Imm & 0x80000000) >> 24 | // a
-                    (Imm & 0x03F80000) >> 19;  // bcdefgh
-    else // Opcode == ARM::FCONSTD
-      ModifiedImm = (Imm & 0x80000000) >> 24 | // a
-                    (Imm & 0x007F0000) >> 16;  // bcdefgh
-
-    // Insts{19-16} = abcd, Insts{3-0} = efgh
-    Binary |= ((ModifiedImm & 0xF0) >> 4) << 16;
-    Binary |= (ModifiedImm & 0xF);
-    break;
-  }
-  }
-
-  emitWordLE(Binary);
-}
-
 static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegD = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  RegD = ARMRegisterInfo::getRegisterNumbering(RegD);
+  RegD = getARMRegisterNumbering(RegD);
   Binary |= (RegD & 0xf) << ARMII::RegRdShift;
   Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift;
   return Binary;
@@ -1641,7 +1747,7 @@ static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) {
 static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegN = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  RegN = ARMRegisterInfo::getRegisterNumbering(RegN);
+  RegN = getARMRegisterNumbering(RegN);
   Binary |= (RegN & 0xf) << ARMII::RegRnShift;
   Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift;
   return Binary;
@@ -1650,7 +1756,7 @@ static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) {
 static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) {
   unsigned RegM = MI.getOperand(OpIdx).getReg();
   unsigned Binary = 0;
-  RegM = ARMRegisterInfo::getRegisterNumbering(RegM);
+  RegM = getARMRegisterNumbering(RegM);
   Binary |= (RegM & 0xf);
   Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift;
   return Binary;
@@ -1684,7 +1790,7 @@ void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) {
   Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
 
   unsigned RegT = MI.getOperand(RegTOpIdx).getReg();
-  RegT = ARMRegisterInfo::getRegisterNumbering(RegT);
+  RegT = getARMRegisterNumbering(RegT);
   Binary |= (RegT << ARMII::RegRdShift);
   Binary |= encodeNEONRn(MI, RegNOpIdx);
 
@@ -1713,7 +1819,7 @@ void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) {
   Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
 
   unsigned RegT = MI.getOperand(1).getReg();
-  RegT = ARMRegisterInfo::getRegisterNumbering(RegT);
+  RegT = getARMRegisterNumbering(RegT);
   Binary |= (RegT << ARMII::RegRdShift);
   Binary |= encodeNEONRn(MI, 0);
   emitWordLE(Binary);
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 60e923b..13d1b33 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1,4 +1,4 @@
-//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===//
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -316,7 +316,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
   }
 
   /// The next UID to take is the first unused one.
-  AFI->initConstPoolEntryUId(CPEMIs.size());
+  AFI->initPICLabelUId(CPEMIs.size());
 
   // Do the initial scan of the function, building up information about the
   // sizes of each block, the location of all the water, and finding all of the
@@ -327,7 +327,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
 
 
   /// Remove dead constant pool entries.
-  RemoveUnusedCPEntries();
+  MadeChange |= RemoveUnusedCPEntries();
 
   // Iteratively place constant pool entries and fix up branches until there
   // is no change.
@@ -368,6 +368,14 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
   if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
     MadeChange |= UndoLRSpillRestore();
 
+  // Save the mapping between original and cloned constpool entries.
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
+      const CPEntry & CPE = CPEntries[i][j];
+      AFI->recordCPEClone(i, CPE.CPI);
+    }
+  }
+
   DEBUG(errs() << '\n'; dumpBBs());
 
   BBSizes.clear();
@@ -482,7 +490,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
         HasInlineAsm = true;
   }
 
-  // Now go back through the instructions and build up our data structures
+  // Now go back through the instructions and build up our data structures.
   unsigned Offset = 0;
   for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
        MBBI != E; ++MBBI) {
@@ -603,7 +611,7 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
             Scale = 4;
             break;
 
-          case ARM::LDR:
+          case ARM::LDRi12:
           case ARM::LDRcp:
           case ARM::t2LDRpci:
             Bits = 12;  // +-offset_12
@@ -611,7 +619,6 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
             break;
 
           case ARM::tLDRpci:
-          case ARM::tLDRcp:
             Bits = 8;
             Scale = 4;  // +(offset_8*4)
             break;
@@ -692,7 +699,7 @@ static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
 /// machine function, it upsets all of the block numbers.  Renumber the blocks
 /// and update the arrays that parallel this numbering.
 void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
-  // Renumber the MBB's to keep them consequtive.
+  // Renumber the MBB's to keep them consecutive.
   NewBB->getParent()->RenumberBlocks(NewBB);
 
   // Insert a size into BBSizes to align it properly with the (newly
@@ -1242,7 +1249,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
 
   // No existing clone of this CPE is within range.
   // We will be generating a new clone.  Get a UID for it.
-  unsigned ID = AFI->createConstPoolEntryUId();
+  unsigned ID = AFI->createPICLabelUId();
 
   // Look for water where we can place this CPE.
   MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
@@ -1644,7 +1651,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) {
     unsigned DestOffset = BBOffsets[DestBB->getNumber()];
     if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
       MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI;
-      if (CmpMI->getOpcode() == ARM::tCMPzi8) {
+      if (CmpMI->getOpcode() == ARM::tCMPi8) {
         unsigned Reg = CmpMI->getOperand(0).getReg();
         Pred = llvm::getInstrPredicate(CmpMI, PredReg);
         if (Pred == ARMCC::AL &&
@@ -1766,7 +1773,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) {
       if (!OptOk)
         continue;
 
-      unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH;
+      unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
       MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
         .addReg(IdxReg, getKillRegState(IdxRegKill))
         .addJumpTableIndex(JTI, JTOP.getTargetFlags())
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
index f13ccc6..165a1d8 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
 ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id,
                                            ARMCP::ARMCPKind K,
                                            unsigned char PCAdj,
-                                           const char *Modif,
+                                           ARMCP::ARMCPModifier Modif,
                                            bool AddCA)
   : MachineConstantPoolValue((const Type*)cval->getType()),
     CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
@@ -33,17 +33,17 @@ ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id,
 ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
                                            const char *s, unsigned id,
                                            unsigned char PCAdj,
-                                           const char *Modif,
+                                           ARMCP::ARMCPModifier Modif,
                                            bool AddCA)
   : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
     CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol),
     PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {}
 
 ARMConstantPoolValue::ARMConstantPoolValue(const GlobalValue *gv,
-                                           const char *Modif)
+                                           ARMCP::ARMCPModifier Modif)
   : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
     CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
-    Modifier(Modif) {}
+    Modifier(Modif), AddCurrentAddress(false) {}
 
 const GlobalValue *ARMConstantPoolValue::getGV() const {
   return dyn_cast_or_null<GlobalValue>(CVal);
@@ -53,6 +53,14 @@ const BlockAddress *ARMConstantPoolValue::getBlockAddress() const {
   return dyn_cast_or_null<BlockAddress>(CVal);
 }
 
+static bool CPV_streq(const char *S1, const char *S2) {
+  if (S1 == S2)
+    return true;
+  if (S1 && S2 && strcmp(S1, S2) == 0)
+    return true;
+  return false;
+}
+
 int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
                                                     unsigned Alignment) {
   unsigned AlignMask = Alignment - 1;
@@ -65,8 +73,8 @@ int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
       if (CPV->CVal == CVal &&
           CPV->LabelId == LabelId &&
           CPV->PCAdjust == PCAdjust &&
-          (CPV->S == S || strcmp(CPV->S, S) == 0) &&
-          (CPV->Modifier == Modifier || strcmp(CPV->Modifier, Modifier) == 0))
+          CPV_streq(CPV->S, S) &&
+          CPV->Modifier == Modifier)
         return i;
     }
   }
@@ -91,8 +99,8 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
   if (ACPV->Kind == Kind &&
       ACPV->CVal == CVal &&
       ACPV->PCAdjust == PCAdjust &&
-      (ACPV->S == S || strcmp(ACPV->S, S) == 0) &&
-      (ACPV->Modifier == Modifier || strcmp(ACPV->Modifier, Modifier) == 0)) {
+      CPV_streq(ACPV->S, S) &&
+      ACPV->Modifier == Modifier) {
     if (ACPV->LabelId == LabelId)
       return true;
     // Two PC relative constpool entries containing the same GV address or
@@ -113,7 +121,7 @@ void ARMConstantPoolValue::print(raw_ostream &O) const {
     O << CVal->getName();
   else
     O << S;
-  if (Modifier) O << "(" << Modifier << ")";
+  if (Modifier) O << "(" << getModifierText() << ")";
   if (PCAdjust != 0) {
     O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
     if (AddCurrentAddress) O << "-.";
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
index 3119b54..d008811 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
 
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <cstddef>
 
 namespace llvm {
@@ -31,6 +32,15 @@ namespace ARMCP {
     CPBlockAddress,
     CPLSDA
   };
+
+  enum ARMCPModifier {
+    no_modifier,
+    TLSGD,
+    GOT,
+    GOTOFF,
+    GOTTPOFF,
+    TPOFF
+  };
 }
 
 /// ARMConstantPoolValue - ARM specific constantpool value. This is used to
@@ -43,26 +53,41 @@ class ARMConstantPoolValue : public MachineConstantPoolValue {
   ARMCP::ARMCPKind Kind;   // Kind of constant.
   unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
                            // 8 for ARM, 4 for Thumb.
-  const char *Modifier;    // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  ARMCP::ARMCPModifier Modifier;   // GV modifier i.e. (&GV(modifier)-(LPIC+8))
   bool AddCurrentAddress;
 
 public:
   ARMConstantPoolValue(const Constant *cval, unsigned id,
                        ARMCP::ARMCPKind Kind = ARMCP::CPValue,
-                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       unsigned char PCAdj = 0,
+                       ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
                        bool AddCurrentAddress = false);
   ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
-                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       unsigned char PCAdj = 0,
+                       ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
                        bool AddCurrentAddress = false);
-  ARMConstantPoolValue(const GlobalValue *GV, const char *Modifier);
+  ARMConstantPoolValue(const GlobalValue *GV, ARMCP::ARMCPModifier Modifier);
   ARMConstantPoolValue();
   ~ARMConstantPoolValue();
 
   const GlobalValue *getGV() const;
   const char *getSymbol() const { return S; }
   const BlockAddress *getBlockAddress() const;
-  const char *getModifier() const { return Modifier; }
-  bool hasModifier() const { return Modifier != NULL; }
+  ARMCP::ARMCPModifier getModifier() const { return Modifier; }
+  const char *getModifierText() const {
+    switch (Modifier) {
+    default: llvm_unreachable("Unknown modifier!");
+    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+    // strings if that's legal.
+    case ARMCP::no_modifier: return "none";
+    case ARMCP::TLSGD:       return "tlsgd";
+    case ARMCP::GOT:         return "GOT";
+    case ARMCP::GOTOFF:      return "GOTOFF";
+    case ARMCP::GOTTPOFF:    return "gottpoff";
+    case ARMCP::TPOFF:       return "tpoff";
+    }
+  }
+  bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
   bool mustAddCurrentAddress() const { return AddCurrentAddress; }
   unsigned getLabelId() const { return LabelId; }
   unsigned char getPCAdjustment() const { return PCAdjust; }
@@ -71,11 +96,7 @@ public:
   bool isBlockAddress() { return Kind == ARMCP::CPBlockAddress; }
   bool isLSDA() { return Kind == ARMCP::CPLSDA; }
 
-  virtual unsigned getRelocationInfo() const {
-    // FIXME: This is conservatively claiming that these entries require a
-    // relocation, we may be able to do better than this.
-    return 2;
-  }
+  virtual unsigned getRelocationInfo() const { return 2; }
 
   virtual int getExistingMachineCPValue(MachineConstantPool *CP,
                                         unsigned Alignment);
diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp
new file mode 100644
index 0000000..51e68b4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp
@@ -0,0 +1,83 @@
+//===-- ARMELFWriterInfo.cpp - ELF Writer Info for the ARM backend --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the ARM backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMELFWriterInfo.h"
+#include "ARMRelocations.h"
+#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Implementation of the ARMELFWriterInfo class
+//===----------------------------------------------------------------------===//
+
+ARMELFWriterInfo::ARMELFWriterInfo(TargetMachine &TM)
+  : TargetELFWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64,
+                        TM.getTargetData()->isLittleEndian()) {
+}
+
+ARMELFWriterInfo::~ARMELFWriterInfo() {}
+
+unsigned ARMELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
+  switch (MachineRelTy) {
+  case ARM::reloc_arm_absolute:
+  case ARM::reloc_arm_relative:
+  case ARM::reloc_arm_cp_entry:
+  case ARM::reloc_arm_vfp_cp_entry:
+  case ARM::reloc_arm_machine_cp_entry:
+  case ARM::reloc_arm_jt_base:
+  case ARM::reloc_arm_pic_jt:
+    assert(0 && "unsupported ARM relocation type"); break;
+    
+  case ARM::reloc_arm_branch: return ELF::R_ARM_CALL; break;
+  case ARM::reloc_arm_movt:   return ELF::R_ARM_MOVT_ABS; break;
+  case ARM::reloc_arm_movw:   return ELF::R_ARM_MOVW_ABS_NC; break;
+  default:
+    llvm_unreachable("unknown ARM relocation type"); break;
+  }
+  return 0;
+}
+
+long int ARMELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
+                                                    long int Modifier) const {
+  assert(0 && "ARMELFWriterInfo::getDefaultAddendForRelTy() not implemented");
+  return 0;
+}
+
+unsigned ARMELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
+  assert(0 && "ARMELFWriterInfo::getRelocationTySize() not implemented");
+  return 0;
+}
+
+bool ARMELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
+  assert(0 && "ARMELFWriterInfo::isPCRelativeRel() not implemented");
+  return 1;
+}
+
+unsigned ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
+  assert(0 &&
+         "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented");
+  return 0;
+}
+
+long int ARMELFWriterInfo::computeRelocation(unsigned SymOffset,
+                                             unsigned RelOffset,
+                                             unsigned RelTy) const {
+  assert(0 &&
+         "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented");
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h
new file mode 100644
index 0000000..1c4e532
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h
@@ -0,0 +1,58 @@
+//===-- ARMELFWriterInfo.h - ELF Writer Info for ARM ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the ARM backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_ELF_WRITER_INFO_H
+#define ARM_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+  class ARMELFWriterInfo : public TargetELFWriterInfo {
+  public:
+    ARMELFWriterInfo(TargetMachine &TM);
+    virtual ~ARMELFWriterInfo();
+
+    /// getRelocationType - Returns the target specific ELF Relocation type.
+    /// 'MachineRelTy' contains the object code independent relocation type
+    virtual unsigned getRelocationType(unsigned MachineRelTy) const;
+
+    /// hasRelocationAddend - True if the target uses an addend in the
+    /// ELF relocation entry.
+    virtual bool hasRelocationAddend() const { return false; }
+
+    /// getDefaultAddendForRelTy - Gets the default addend value for a
+    /// relocation entry based on the target ELF relocation type.
+    virtual long int getDefaultAddendForRelTy(unsigned RelTy,
+                                              long int Modifier = 0) const;
+
+    /// getRelTySize - Returns the size of relocatable field in bits
+    virtual unsigned getRelocationTySize(unsigned RelTy) const;
+
+    /// isPCRelativeRel - True if the relocation type is pc relative
+    virtual bool isPCRelativeRel(unsigned RelTy) const;
+
+    /// getJumpTableRelocationTy - Returns the machine relocation type used
+    /// to reference a jumptable.
+    virtual unsigned getAbsoluteLabelMachineRelTy() const;
+
+    /// computeRelocation - Some relocatable fields could be relocated
+    /// directly, avoiding the relocation symbol emission, compute the
+    /// final relocation value for this symbol.
+    virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
+                                       unsigned RelTy) const;
+  };
+
+} // end llvm namespace
+
+#endif // ARM_ELF_WRITER_INFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index fc2e3c3..bd753d2 100644
--- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -7,36 +7,38 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains a pass that expand pseudo instructions into target
+// This file contains a pass that expands pseudo instructions into target
 // instructions to allow proper scheduling, if-conversion, and other late
 // optimizations. This pass should be run after register allocation but before
-// post- regalloc scheduling pass.
+// the post-regalloc scheduling pass.
 //
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "arm-pseudo"
 #include "ARM.h"
+#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
 using namespace llvm;
 
 namespace {
   class ARMExpandPseudo : public MachineFunctionPass {
-    // Constants for register spacing in NEON load/store instructions.
-    enum NEONRegSpacing {
-      SingleSpc,
-      EvenDblSpc,
-      OddDblSpc
-    };
-
   public:
     static char ID;
     ARMExpandPseudo() : MachineFunctionPass(ID) {}
 
-    const TargetInstrInfo *TII;
+    const ARMBaseInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
 
     virtual bool runOnMachineFunction(MachineFunction &Fn);
 
@@ -47,11 +49,16 @@ namespace {
   private:
     void TransferImpOps(MachineInstr &OldMI,
                         MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
+    bool ExpandMI(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MBBI);
     bool ExpandMBB(MachineBasicBlock &MBB);
-    void ExpandVLD(MachineBasicBlock::iterator &MBBI, unsigned Opc,
-                   bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs);
-    void ExpandVST(MachineBasicBlock::iterator &MBBI, unsigned Opc,
-                   bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs);
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                    unsigned Opc, bool IsExt, unsigned NumRegs);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
   };
   char ARMExpandPseudo::ID = 0;
 }
@@ -67,44 +74,349 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
     const MachineOperand &MO = OldMI.getOperand(i);
     assert(MO.isReg() && MO.getReg());
     if (MO.isUse())
-      UseMI.addReg(MO.getReg(), getKillRegState(MO.isKill()));
+      UseMI.addOperand(MO);
     else
-      DefMI.addReg(MO.getReg(),
-                   getDefRegState(true) | getDeadRegState(MO.isDead()));
+      DefMI.addOperand(MO);
+  }
+}
+
+namespace {
+  // Constants for register spacing in NEON load/store instructions.
+  // For quad-register load-lane and store-lane pseudo instructors, the
+  // spacing is initially assumed to be EvenDblSpc, and that is changed to
+  // OddDblSpc depending on the lane number operand.
+  enum NEONRegSpacing {
+    SingleSpc,
+    EvenDblSpc,
+    OddDblSpc
+  };
+
+  // Entries for NEON load/store information table.  The table is sorted by
+  // PseudoOpc for fast binary-search lookups.
+  struct NEONLdStTableEntry {
+    unsigned PseudoOpc;
+    unsigned RealOpc;
+    bool IsLoad;
+    bool HasWriteBack;
+    NEONRegSpacing RegSpacing;
+    unsigned char NumRegs; // D registers loaded or stored
+    unsigned char RegElts; // elements per D register; used for lane ops
+
+    // Comparison methods for binary search of the table.
+    bool operator<(const NEONLdStTableEntry &TE) const {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+    friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
+      return TE.PseudoOpc < PseudoOpc;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
+                                                const NEONLdStTableEntry &TE) {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+  };
+}
+
+static const NEONLdStTableEntry NEONLdStTable[] = {
+{ ARM::VLD1DUPq16Pseudo,     ARM::VLD1DUPq16,     true, false, SingleSpc, 2, 4},
+{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true,  SingleSpc, 2, 4},
+{ ARM::VLD1DUPq32Pseudo,     ARM::VLD1DUPq32,     true, false, SingleSpc, 2, 2},
+{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true,  SingleSpc, 2, 2},
+{ ARM::VLD1DUPq8Pseudo,      ARM::VLD1DUPq8,      true, false, SingleSpc, 2, 8},
+{ ARM::VLD1DUPq8Pseudo_UPD,  ARM::VLD1DUPq8_UPD,  true, true,  SingleSpc, 2, 8},
+
+{ ARM::VLD1LNq16Pseudo,     ARM::VLD1LNd16,     true, false, EvenDblSpc, 1, 4 },
+{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true,  EvenDblSpc, 1, 4 },
+{ ARM::VLD1LNq32Pseudo,     ARM::VLD1LNd32,     true, false, EvenDblSpc, 1, 2 },
+{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true,  EvenDblSpc, 1, 2 },
+{ ARM::VLD1LNq8Pseudo,      ARM::VLD1LNd8,      true, false, EvenDblSpc, 1, 8 },
+{ ARM::VLD1LNq8Pseudo_UPD,  ARM::VLD1LNd8_UPD,  true, true,  EvenDblSpc, 1, 8 },
+
+{ ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true,  false, SingleSpc,  4, 1 },
+{ ARM::VLD1d64QPseudo_UPD,  ARM::VLD1d64Q_UPD, true,  true,  SingleSpc,  4, 1 },
+{ ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true,  false, SingleSpc,  3, 1 },
+{ ARM::VLD1d64TPseudo_UPD,  ARM::VLD1d64T_UPD, true,  true,  SingleSpc,  3, 1 },
+
+{ ARM::VLD1q16Pseudo,       ARM::VLD1q16,      true,  false, SingleSpc,  2, 4 },
+{ ARM::VLD1q16Pseudo_UPD,   ARM::VLD1q16_UPD,  true,  true,  SingleSpc,  2, 4 },
+{ ARM::VLD1q32Pseudo,       ARM::VLD1q32,      true,  false, SingleSpc,  2, 2 },
+{ ARM::VLD1q32Pseudo_UPD,   ARM::VLD1q32_UPD,  true,  true,  SingleSpc,  2, 2 },
+{ ARM::VLD1q64Pseudo,       ARM::VLD1q64,      true,  false, SingleSpc,  2, 1 },
+{ ARM::VLD1q64Pseudo_UPD,   ARM::VLD1q64_UPD,  true,  true,  SingleSpc,  2, 1 },
+{ ARM::VLD1q8Pseudo,        ARM::VLD1q8,       true,  false, SingleSpc,  2, 8 },
+{ ARM::VLD1q8Pseudo_UPD,    ARM::VLD1q8_UPD,   true,  true,  SingleSpc,  2, 8 },
+
+{ ARM::VLD2DUPd16Pseudo,     ARM::VLD2DUPd16,     true, false, SingleSpc, 2, 4},
+{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true,  SingleSpc, 2, 4},
+{ ARM::VLD2DUPd32Pseudo,     ARM::VLD2DUPd32,     true, false, SingleSpc, 2, 2},
+{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true,  SingleSpc, 2, 2},
+{ ARM::VLD2DUPd8Pseudo,      ARM::VLD2DUPd8,      true, false, SingleSpc, 2, 8},
+{ ARM::VLD2DUPd8Pseudo_UPD,  ARM::VLD2DUPd8_UPD,  true, true,  SingleSpc, 2, 8},
+
+{ ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, SingleSpc,  2, 4 },
+{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true,  SingleSpc,  2, 4 },
+{ ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, SingleSpc,  2, 2 },
+{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true,  SingleSpc,  2, 2 },
+{ ARM::VLD2LNd8Pseudo,      ARM::VLD2LNd8,      true, false, SingleSpc,  2, 8 },
+{ ARM::VLD2LNd8Pseudo_UPD,  ARM::VLD2LNd8_UPD,  true, true,  SingleSpc,  2, 8 },
+{ ARM::VLD2LNq16Pseudo,     ARM::VLD2LNq16,     true, false, EvenDblSpc, 2, 4 },
+{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true,  EvenDblSpc, 2, 4 },
+{ ARM::VLD2LNq32Pseudo,     ARM::VLD2LNq32,     true, false, EvenDblSpc, 2, 2 },
+{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true,  EvenDblSpc, 2, 2 },
+
+{ ARM::VLD2d16Pseudo,       ARM::VLD2d16,      true,  false, SingleSpc,  2, 4 },
+{ ARM::VLD2d16Pseudo_UPD,   ARM::VLD2d16_UPD,  true,  true,  SingleSpc,  2, 4 },
+{ ARM::VLD2d32Pseudo,       ARM::VLD2d32,      true,  false, SingleSpc,  2, 2 },
+{ ARM::VLD2d32Pseudo_UPD,   ARM::VLD2d32_UPD,  true,  true,  SingleSpc,  2, 2 },
+{ ARM::VLD2d8Pseudo,        ARM::VLD2d8,       true,  false, SingleSpc,  2, 8 },
+{ ARM::VLD2d8Pseudo_UPD,    ARM::VLD2d8_UPD,   true,  true,  SingleSpc,  2, 8 },
+
+{ ARM::VLD2q16Pseudo,       ARM::VLD2q16,      true,  false, SingleSpc,  4, 4 },
+{ ARM::VLD2q16Pseudo_UPD,   ARM::VLD2q16_UPD,  true,  true,  SingleSpc,  4, 4 },
+{ ARM::VLD2q32Pseudo,       ARM::VLD2q32,      true,  false, SingleSpc,  4, 2 },
+{ ARM::VLD2q32Pseudo_UPD,   ARM::VLD2q32_UPD,  true,  true,  SingleSpc,  4, 2 },
+{ ARM::VLD2q8Pseudo,        ARM::VLD2q8,       true,  false, SingleSpc,  4, 8 },
+{ ARM::VLD2q8Pseudo_UPD,    ARM::VLD2q8_UPD,   true,  true,  SingleSpc,  4, 8 },
+
+{ ARM::VLD3DUPd16Pseudo,     ARM::VLD3DUPd16,     true, false, SingleSpc, 3, 4},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true,  SingleSpc, 3, 4},
+{ ARM::VLD3DUPd32Pseudo,     ARM::VLD3DUPd32,     true, false, SingleSpc, 3, 2},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true,  SingleSpc, 3, 2},
+{ ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, SingleSpc, 3, 8},
+{ ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD,  true, true,  SingleSpc, 3, 8},
+
+{ ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, SingleSpc,  3, 4 },
+{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true,  SingleSpc,  3, 4 },
+{ ARM::VLD3LNd32Pseudo,     ARM::VLD3LNd32,     true, false, SingleSpc,  3, 2 },
+{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true,  SingleSpc,  3, 2 },
+{ ARM::VLD3LNd8Pseudo,      ARM::VLD3LNd8,      true, false, SingleSpc,  3, 8 },
+{ ARM::VLD3LNd8Pseudo_UPD,  ARM::VLD3LNd8_UPD,  true, true,  SingleSpc,  3, 8 },
+{ ARM::VLD3LNq16Pseudo,     ARM::VLD3LNq16,     true, false, EvenDblSpc, 3, 4 },
+{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true,  EvenDblSpc, 3, 4 },
+{ ARM::VLD3LNq32Pseudo,     ARM::VLD3LNq32,     true, false, EvenDblSpc, 3, 2 },
+{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true,  EvenDblSpc, 3, 2 },
+
+{ ARM::VLD3d16Pseudo,       ARM::VLD3d16,      true,  false, SingleSpc,  3, 4 },
+{ ARM::VLD3d16Pseudo_UPD,   ARM::VLD3d16_UPD,  true,  true,  SingleSpc,  3, 4 },
+{ ARM::VLD3d32Pseudo,       ARM::VLD3d32,      true,  false, SingleSpc,  3, 2 },
+{ ARM::VLD3d32Pseudo_UPD,   ARM::VLD3d32_UPD,  true,  true,  SingleSpc,  3, 2 },
+{ ARM::VLD3d8Pseudo,        ARM::VLD3d8,       true,  false, SingleSpc,  3, 8 },
+{ ARM::VLD3d8Pseudo_UPD,    ARM::VLD3d8_UPD,   true,  true,  SingleSpc,  3, 8 },
+
+{ ARM::VLD3q16Pseudo_UPD,    ARM::VLD3q16_UPD, true,  true,  EvenDblSpc, 3, 4 },
+{ ARM::VLD3q16oddPseudo,     ARM::VLD3q16,     true,  false, OddDblSpc,  3, 4 },
+{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true,  true,  OddDblSpc,  3, 4 },
+{ ARM::VLD3q32Pseudo_UPD,    ARM::VLD3q32_UPD, true,  true,  EvenDblSpc, 3, 2 },
+{ ARM::VLD3q32oddPseudo,     ARM::VLD3q32,     true,  false, OddDblSpc,  3, 2 },
+{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true,  true,  OddDblSpc,  3, 2 },
+{ ARM::VLD3q8Pseudo_UPD,     ARM::VLD3q8_UPD,  true,  true,  EvenDblSpc, 3, 8 },
+{ ARM::VLD3q8oddPseudo,      ARM::VLD3q8,      true,  false, OddDblSpc,  3, 8 },
+{ ARM::VLD3q8oddPseudo_UPD,  ARM::VLD3q8_UPD,  true,  true,  OddDblSpc,  3, 8 },
+
+{ ARM::VLD4DUPd16Pseudo,     ARM::VLD4DUPd16,     true, false, SingleSpc, 4, 4},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true,  SingleSpc, 4, 4},
+{ ARM::VLD4DUPd32Pseudo,     ARM::VLD4DUPd32,     true, false, SingleSpc, 4, 2},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true,  SingleSpc, 4, 2},
+{ ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, SingleSpc, 4, 8},
+{ ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD,  true, true,  SingleSpc, 4, 8},
+
+{ ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, SingleSpc,  4, 4 },
+{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true,  SingleSpc,  4, 4 },
+{ ARM::VLD4LNd32Pseudo,     ARM::VLD4LNd32,     true, false, SingleSpc,  4, 2 },
+{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true,  SingleSpc,  4, 2 },
+{ ARM::VLD4LNd8Pseudo,      ARM::VLD4LNd8,      true, false, SingleSpc,  4, 8 },
+{ ARM::VLD4LNd8Pseudo_UPD,  ARM::VLD4LNd8_UPD,  true, true,  SingleSpc,  4, 8 },
+{ ARM::VLD4LNq16Pseudo,     ARM::VLD4LNq16,     true, false, EvenDblSpc, 4, 4 },
+{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true,  EvenDblSpc, 4, 4 },
+{ ARM::VLD4LNq32Pseudo,     ARM::VLD4LNq32,     true, false, EvenDblSpc, 4, 2 },
+{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true,  EvenDblSpc, 4, 2 },
+
+{ ARM::VLD4d16Pseudo,       ARM::VLD4d16,      true,  false, SingleSpc,  4, 4 },
+{ ARM::VLD4d16Pseudo_UPD,   ARM::VLD4d16_UPD,  true,  true,  SingleSpc,  4, 4 },
+{ ARM::VLD4d32Pseudo,       ARM::VLD4d32,      true,  false, SingleSpc,  4, 2 },
+{ ARM::VLD4d32Pseudo_UPD,   ARM::VLD4d32_UPD,  true,  true,  SingleSpc,  4, 2 },
+{ ARM::VLD4d8Pseudo,        ARM::VLD4d8,       true,  false, SingleSpc,  4, 8 },
+{ ARM::VLD4d8Pseudo_UPD,    ARM::VLD4d8_UPD,   true,  true,  SingleSpc,  4, 8 },
+
+{ ARM::VLD4q16Pseudo_UPD,    ARM::VLD4q16_UPD, true,  true,  EvenDblSpc, 4, 4 },
+{ ARM::VLD4q16oddPseudo,     ARM::VLD4q16,     true,  false, OddDblSpc,  4, 4 },
+{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true,  true,  OddDblSpc,  4, 4 },
+{ ARM::VLD4q32Pseudo_UPD,    ARM::VLD4q32_UPD, true,  true,  EvenDblSpc, 4, 2 },
+{ ARM::VLD4q32oddPseudo,     ARM::VLD4q32,     true,  false, OddDblSpc,  4, 2 },
+{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true,  true,  OddDblSpc,  4, 2 },
+{ ARM::VLD4q8Pseudo_UPD,     ARM::VLD4q8_UPD,  true,  true,  EvenDblSpc, 4, 8 },
+{ ARM::VLD4q8oddPseudo,      ARM::VLD4q8,      true,  false, OddDblSpc,  4, 8 },
+{ ARM::VLD4q8oddPseudo_UPD,  ARM::VLD4q8_UPD,  true,  true,  OddDblSpc,  4, 8 },
+
+{ ARM::VST1LNq16Pseudo,     ARM::VST1LNd16,    false, false, EvenDblSpc, 1, 4 },
+{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD,false, true,  EvenDblSpc, 1, 4 },
+{ ARM::VST1LNq32Pseudo,     ARM::VST1LNd32,    false, false, EvenDblSpc, 1, 2 },
+{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD,false, true,  EvenDblSpc, 1, 2 },
+{ ARM::VST1LNq8Pseudo,      ARM::VST1LNd8,     false, false, EvenDblSpc, 1, 8 },
+{ ARM::VST1LNq8Pseudo_UPD,  ARM::VST1LNd8_UPD, false, true,  EvenDblSpc, 1, 8 },
+
+{ ARM::VST1d64QPseudo,      ARM::VST1d64Q,     false, false, SingleSpc,  4, 1 },
+{ ARM::VST1d64QPseudo_UPD,  ARM::VST1d64Q_UPD, false, true,  SingleSpc,  4, 1 },
+{ ARM::VST1d64TPseudo,      ARM::VST1d64T,     false, false, SingleSpc,  3, 1 },
+{ ARM::VST1d64TPseudo_UPD,  ARM::VST1d64T_UPD, false, true,  SingleSpc,  3, 1 },
+
+{ ARM::VST1q16Pseudo,       ARM::VST1q16,      false, false, SingleSpc,  2, 4 },
+{ ARM::VST1q16Pseudo_UPD,   ARM::VST1q16_UPD,  false, true,  SingleSpc,  2, 4 },
+{ ARM::VST1q32Pseudo,       ARM::VST1q32,      false, false, SingleSpc,  2, 2 },
+{ ARM::VST1q32Pseudo_UPD,   ARM::VST1q32_UPD,  false, true,  SingleSpc,  2, 2 },
+{ ARM::VST1q64Pseudo,       ARM::VST1q64,      false, false, SingleSpc,  2, 1 },
+{ ARM::VST1q64Pseudo_UPD,   ARM::VST1q64_UPD,  false, true,  SingleSpc,  2, 1 },
+{ ARM::VST1q8Pseudo,        ARM::VST1q8,       false, false, SingleSpc,  2, 8 },
+{ ARM::VST1q8Pseudo_UPD,    ARM::VST1q8_UPD,   false, true,  SingleSpc,  2, 8 },
+
+{ ARM::VST2LNd16Pseudo,     ARM::VST2LNd16,     false, false, SingleSpc, 2, 4 },
+{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true,  SingleSpc, 2, 4 },
+{ ARM::VST2LNd32Pseudo,     ARM::VST2LNd32,     false, false, SingleSpc, 2, 2 },
+{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true,  SingleSpc, 2, 2 },
+{ ARM::VST2LNd8Pseudo,      ARM::VST2LNd8,      false, false, SingleSpc, 2, 8 },
+{ ARM::VST2LNd8Pseudo_UPD,  ARM::VST2LNd8_UPD,  false, true,  SingleSpc, 2, 8 },
+{ ARM::VST2LNq16Pseudo,     ARM::VST2LNq16,     false, false, EvenDblSpc, 2, 4},
+{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true,  EvenDblSpc, 2, 4},
+{ ARM::VST2LNq32Pseudo,     ARM::VST2LNq32,     false, false, EvenDblSpc, 2, 2},
+{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true,  EvenDblSpc, 2, 2},
+
+{ ARM::VST2d16Pseudo,       ARM::VST2d16,      false, false, SingleSpc,  2, 4 },
+{ ARM::VST2d16Pseudo_UPD,   ARM::VST2d16_UPD,  false, true,  SingleSpc,  2, 4 },
+{ ARM::VST2d32Pseudo,       ARM::VST2d32,      false, false, SingleSpc,  2, 2 },
+{ ARM::VST2d32Pseudo_UPD,   ARM::VST2d32_UPD,  false, true,  SingleSpc,  2, 2 },
+{ ARM::VST2d8Pseudo,        ARM::VST2d8,       false, false, SingleSpc,  2, 8 },
+{ ARM::VST2d8Pseudo_UPD,    ARM::VST2d8_UPD,   false, true,  SingleSpc,  2, 8 },
+
+{ ARM::VST2q16Pseudo,       ARM::VST2q16,      false, false, SingleSpc,  4, 4 },
+{ ARM::VST2q16Pseudo_UPD,   ARM::VST2q16_UPD,  false, true,  SingleSpc,  4, 4 },
+{ ARM::VST2q32Pseudo,       ARM::VST2q32,      false, false, SingleSpc,  4, 2 },
+{ ARM::VST2q32Pseudo_UPD,   ARM::VST2q32_UPD,  false, true,  SingleSpc,  4, 2 },
+{ ARM::VST2q8Pseudo,        ARM::VST2q8,       false, false, SingleSpc,  4, 8 },
+{ ARM::VST2q8Pseudo_UPD,    ARM::VST2q8_UPD,   false, true,  SingleSpc,  4, 8 },
+
+{ ARM::VST3LNd16Pseudo,     ARM::VST3LNd16,     false, false, SingleSpc, 3, 4 },
+{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true,  SingleSpc, 3, 4 },
+{ ARM::VST3LNd32Pseudo,     ARM::VST3LNd32,     false, false, SingleSpc, 3, 2 },
+{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true,  SingleSpc, 3, 2 },
+{ ARM::VST3LNd8Pseudo,      ARM::VST3LNd8,      false, false, SingleSpc, 3, 8 },
+{ ARM::VST3LNd8Pseudo_UPD,  ARM::VST3LNd8_UPD,  false, true,  SingleSpc, 3, 8 },
+{ ARM::VST3LNq16Pseudo,     ARM::VST3LNq16,     false, false, EvenDblSpc, 3, 4},
+{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true,  EvenDblSpc, 3, 4},
+{ ARM::VST3LNq32Pseudo,     ARM::VST3LNq32,     false, false, EvenDblSpc, 3, 2},
+{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true,  EvenDblSpc, 3, 2},
+
+{ ARM::VST3d16Pseudo,       ARM::VST3d16,      false, false, SingleSpc,  3, 4 },
+{ ARM::VST3d16Pseudo_UPD,   ARM::VST3d16_UPD,  false, true,  SingleSpc,  3, 4 },
+{ ARM::VST3d32Pseudo,       ARM::VST3d32,      false, false, SingleSpc,  3, 2 },
+{ ARM::VST3d32Pseudo_UPD,   ARM::VST3d32_UPD,  false, true,  SingleSpc,  3, 2 },
+{ ARM::VST3d8Pseudo,        ARM::VST3d8,       false, false, SingleSpc,  3, 8 },
+{ ARM::VST3d8Pseudo_UPD,    ARM::VST3d8_UPD,   false, true,  SingleSpc,  3, 8 },
+
+{ ARM::VST3q16Pseudo_UPD,    ARM::VST3q16_UPD, false, true,  EvenDblSpc, 3, 4 },
+{ ARM::VST3q16oddPseudo,     ARM::VST3q16,     false, false, OddDblSpc,  3, 4 },
+{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true,  OddDblSpc,  3, 4 },
+{ ARM::VST3q32Pseudo_UPD,    ARM::VST3q32_UPD, false, true,  EvenDblSpc, 3, 2 },
+{ ARM::VST3q32oddPseudo,     ARM::VST3q32,     false, false, OddDblSpc,  3, 2 },
+{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true,  OddDblSpc,  3, 2 },
+{ ARM::VST3q8Pseudo_UPD,     ARM::VST3q8_UPD,  false, true,  EvenDblSpc, 3, 8 },
+{ ARM::VST3q8oddPseudo,      ARM::VST3q8,      false, false, OddDblSpc,  3, 8 },
+{ ARM::VST3q8oddPseudo_UPD,  ARM::VST3q8_UPD,  false, true,  OddDblSpc,  3, 8 },
+
+{ ARM::VST4LNd16Pseudo,     ARM::VST4LNd16,     false, false, SingleSpc, 4, 4 },
+{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true,  SingleSpc, 4, 4 },
+{ ARM::VST4LNd32Pseudo,     ARM::VST4LNd32,     false, false, SingleSpc, 4, 2 },
+{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true,  SingleSpc, 4, 2 },
+{ ARM::VST4LNd8Pseudo,      ARM::VST4LNd8,      false, false, SingleSpc, 4, 8 },
+{ ARM::VST4LNd8Pseudo_UPD,  ARM::VST4LNd8_UPD,  false, true,  SingleSpc, 4, 8 },
+{ ARM::VST4LNq16Pseudo,     ARM::VST4LNq16,     false, false, EvenDblSpc, 4, 4},
+{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true,  EvenDblSpc, 4, 4},
+{ ARM::VST4LNq32Pseudo,     ARM::VST4LNq32,     false, false, EvenDblSpc, 4, 2},
+{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true,  EvenDblSpc, 4, 2},
+
+{ ARM::VST4d16Pseudo,       ARM::VST4d16,      false, false, SingleSpc,  4, 4 },
+{ ARM::VST4d16Pseudo_UPD,   ARM::VST4d16_UPD,  false, true,  SingleSpc,  4, 4 },
+{ ARM::VST4d32Pseudo,       ARM::VST4d32,      false, false, SingleSpc,  4, 2 },
+{ ARM::VST4d32Pseudo_UPD,   ARM::VST4d32_UPD,  false, true,  SingleSpc,  4, 2 },
+{ ARM::VST4d8Pseudo,        ARM::VST4d8,       false, false, SingleSpc,  4, 8 },
+{ ARM::VST4d8Pseudo_UPD,    ARM::VST4d8_UPD,   false, true,  SingleSpc,  4, 8 },
+
+{ ARM::VST4q16Pseudo_UPD,    ARM::VST4q16_UPD, false, true,  EvenDblSpc, 4, 4 },
+{ ARM::VST4q16oddPseudo,     ARM::VST4q16,     false, false, OddDblSpc,  4, 4 },
+{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true,  OddDblSpc,  4, 4 },
+{ ARM::VST4q32Pseudo_UPD,    ARM::VST4q32_UPD, false, true,  EvenDblSpc, 4, 2 },
+{ ARM::VST4q32oddPseudo,     ARM::VST4q32,     false, false, OddDblSpc,  4, 2 },
+{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true,  OddDblSpc,  4, 2 },
+{ ARM::VST4q8Pseudo_UPD,     ARM::VST4q8_UPD,  false, true,  EvenDblSpc, 4, 8 },
+{ ARM::VST4q8oddPseudo,      ARM::VST4q8,      false, false, OddDblSpc,  4, 8 },
+{ ARM::VST4q8oddPseudo_UPD,  ARM::VST4q8_UPD,  false, true,  OddDblSpc,  4, 8 }
+};
+
+/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
+/// load or store pseudo instruction.
+static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
+  unsigned NumEntries = array_lengthof(NEONLdStTable);
+
+#ifndef NDEBUG
+  // Make sure the table is sorted.
+  static bool TableChecked = false;
+  if (!TableChecked) {
+    for (unsigned i = 0; i != NumEntries-1; ++i)
+      assert(NEONLdStTable[i] < NEONLdStTable[i+1] &&
+             "NEONLdStTable is not sorted!");
+    TableChecked = true;
+  }
+#endif
+
+  const NEONLdStTableEntry *I =
+    std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode);
+  if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode)
+    return I;
+  return NULL;
+}
+
+/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
+/// corresponding to the specified register spacing.  Not all of the results
+/// are necessarily valid, e.g., a Q register only has 2 D subregisters.
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
+                        const TargetRegisterInfo *TRI, unsigned &D0,
+                        unsigned &D1, unsigned &D2, unsigned &D3) {
+  if (RegSpc == SingleSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+  } else if (RegSpc == EvenDblSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_4);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_6);
+  } else {
+    assert(RegSpc == OddDblSpc && "unknown register spacing");
+    D0 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_3);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_7);
   }
 }
 
 /// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
 /// operands to real VLD instructions with D register operands.
-void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI,
-                                unsigned Opc, bool hasWriteBack,
-                                NEONRegSpacing RegSpc, unsigned NumRegs) {
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
 
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
   unsigned OpIdx = 0;
 
   bool DstIsDead = MI.getOperand(OpIdx).isDead();
   unsigned DstReg = MI.getOperand(OpIdx++).getReg();
   unsigned D0, D1, D2, D3;
-  if (RegSpc == SingleSpc) {
-    D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
-    D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
-    D2 = TRI->getSubReg(DstReg, ARM::dsub_2);
-    D3 = TRI->getSubReg(DstReg, ARM::dsub_3);
-  } else if (RegSpc == EvenDblSpc) {
-    D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
-    D1 = TRI->getSubReg(DstReg, ARM::dsub_2);
-    D2 = TRI->getSubReg(DstReg, ARM::dsub_4);
-    D3 = TRI->getSubReg(DstReg, ARM::dsub_6);
-  } else {
-    assert(RegSpc == OddDblSpc && "unknown register spacing for VLD");
-    D0 = TRI->getSubReg(DstReg, ARM::dsub_1);
-    D1 = TRI->getSubReg(DstReg, ARM::dsub_3);
-    D2 = TRI->getSubReg(DstReg, ARM::dsub_5);
-    D3 = TRI->getSubReg(DstReg, ARM::dsub_7);
-  } 
+  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
   MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
     .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
   if (NumRegs > 2)
@@ -112,107 +424,373 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI,
   if (NumRegs > 3)
     MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
 
-  if (hasWriteBack) {
-    bool WBIsDead = MI.getOperand(OpIdx).isDead();
-    unsigned WBReg = MI.getOperand(OpIdx++).getReg();
-    MIB.addReg(WBReg, RegState::Define | getDeadRegState(WBIsDead));
-  }
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
   // Copy the addrmode6 operands.
-  bool AddrIsKill = MI.getOperand(OpIdx).isKill();
-  MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill));
-  MIB.addImm(MI.getOperand(OpIdx++).getImm());
-  if (hasWriteBack) {
-    // Copy the am6offset operand.
-    bool OffsetIsKill = MI.getOperand(OpIdx).isKill();
-    MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill));
-  }
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
 
-  MIB = AddDefaultPred(MIB);
-  TransferImpOps(MI, MIB, MIB);
-  // For an instruction writing the odd subregs, add an implicit use of the
-  // super-register because the even subregs were loaded separately.
-  if (RegSpc == OddDblSpc)
-    MIB.addReg(DstReg, RegState::Implicit);
+  // For an instruction writing double-spaced subregs, the pseudo instruction
+  // has an extra operand that is a use of the super-register.  Record the
+  // operand index and skip over it.
+  unsigned SrcOpIdx = 0;
+  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
+    SrcOpIdx = OpIdx++;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source operand used for double-spaced subregs over
+  // to the new instruction as an implicit operand.
+  if (SrcOpIdx != 0) {
+    MachineOperand MO = MI.getOperand(SrcOpIdx);
+    MO.setImplicit(true);
+    MIB.addOperand(MO);
+  }
   // Add an implicit def for the super-register.
   MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
   MI.eraseFromParent();
 }
 
 /// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
 /// operands to real VST instructions with D register operands.
-void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI,
-                                unsigned Opc, bool hasWriteBack,
-                                NEONRegSpacing RegSpc, unsigned NumRegs) {
+void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
   MachineBasicBlock &MBB = *MI.getParent();
 
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
   unsigned OpIdx = 0;
-  if (hasWriteBack) {
-    bool DstIsDead = MI.getOperand(OpIdx).isDead();
-    unsigned DstReg = MI.getOperand(OpIdx++).getReg();
-    MIB.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
-  }
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
   // Copy the addrmode6 operands.
-  bool AddrIsKill = MI.getOperand(OpIdx).isKill();
-  MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill));
-  MIB.addImm(MI.getOperand(OpIdx++).getImm());
-  if (hasWriteBack) {
-    // Copy the am6offset operand.
-    bool OffsetIsKill = MI.getOperand(OpIdx).isKill();
-    MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill));
-  }
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
 
   bool SrcIsKill = MI.getOperand(OpIdx).isKill();
-  unsigned SrcReg = MI.getOperand(OpIdx).getReg();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
   unsigned D0, D1, D2, D3;
-  if (RegSpc == SingleSpc) {
-    D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
-    D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
-    D2 = TRI->getSubReg(SrcReg, ARM::dsub_2);
-    D3 = TRI->getSubReg(SrcReg, ARM::dsub_3);
-  } else if (RegSpc == EvenDblSpc) {
-    D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
-    D1 = TRI->getSubReg(SrcReg, ARM::dsub_2);
-    D2 = TRI->getSubReg(SrcReg, ARM::dsub_4);
-    D3 = TRI->getSubReg(SrcReg, ARM::dsub_6);
-  } else {
-    assert(RegSpc == OddDblSpc && "unknown register spacing for VST");
-    D0 = TRI->getSubReg(SrcReg, ARM::dsub_1);
-    D1 = TRI->getSubReg(SrcReg, ARM::dsub_3);
-    D2 = TRI->getSubReg(SrcReg, ARM::dsub_5);
-    D3 = TRI->getSubReg(SrcReg, ARM::dsub_7);
-  } 
-
+  GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
   MIB.addReg(D0).addReg(D1);
   if (NumRegs > 2)
     MIB.addReg(D2);
   if (NumRegs > 3)
     MIB.addReg(D3);
-  MIB = AddDefaultPred(MIB);
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  if (SrcIsKill)
+    // Add an implicit kill for the super-reg.
+    (*MIB).addRegisterKilled(SrcReg, TRI, true);
+  TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+  unsigned RegElts = TableEntry->RegElts;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  // The lane operand is always the 3rd from last operand, before the 2
+  // predicate operands.
+  unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
+
+  // Adjust the lane and spacing as needed for Q registers.
+  assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
+  if (RegSpc == EvenDblSpc && Lane >= RegElts) {
+    RegSpc = OddDblSpc;
+    Lane -= RegElts;
+  }
+  assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
+
+  unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
+  unsigned DstReg = 0;
+  bool DstIsDead = false;
+  if (TableEntry->IsLoad) {
+    DstIsDead = MI.getOperand(OpIdx).isDead();
+    DstReg = MI.getOperand(OpIdx++).getReg();
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }
+
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->HasWriteBack)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Grab the super-register source.
+  MachineOperand MO = MI.getOperand(OpIdx++);
+  if (!TableEntry->IsLoad)
+    GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
+
+  // Add the subregs as sources of the new instruction.
+  unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+                       getKillRegState(MO.isKill()));
+  MIB.addReg(D0, SrcFlags);
+  if (NumRegs > 1)
+    MIB.addReg(D1, SrcFlags);
+  if (NumRegs > 2)
+    MIB.addReg(D2, SrcFlags);
+  if (NumRegs > 3)
+    MIB.addReg(D3, SrcFlags);
+
+  // Add the lane number operand.
+  MIB.addImm(Lane);
+  OpIdx += 1;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source to be an implicit source.
+  MO.setImplicit(true);
+  MIB.addOperand(MO);
+  if (TableEntry->IsLoad)
+    // Add an implicit def for the super-register.
+    MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
   TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt, unsigned NumRegs) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned OpIdx = 0;
+
+  // Transfer the destination register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  if (IsExt)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0).addReg(D1);
+  if (NumRegs > 2)
+    MIB.addReg(D2);
+  if (NumRegs > 3)
+    MIB.addReg(D3);
+
+  // Copy the other source register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
   if (SrcIsKill)
     // Add an implicit kill for the super-reg.
     (*MIB).addRegisterKilled(SrcReg, TRI, true);
+  TransferImpOps(MI, MIB, MIB);
   MI.eraseFromParent();
 }
 
-bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
-  bool Modified = false;
+void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
+  const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
+  MachineInstrBuilder LO16, HI16;
 
-  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-  while (MBBI != E) {
-    MachineInstr &MI = *MBBI;
-    MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+  if (!STI->hasV6T2Ops() &&
+      (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+    // Expand into a movi + orr.
+    LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+    HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstReg);
+
+    assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
+    unsigned ImmVal = (unsigned)MO.getImm();
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    LO16 = LO16.addImm(SOImmValV1);
+    HI16 = HI16.addImm(SOImmValV2);
+    (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    LO16.addImm(Pred).addReg(PredReg).addReg(0);
+    HI16.addImm(Pred).addReg(PredReg).addReg(0);
+    TransferImpOps(MI, LO16, HI16);
+    MI.eraseFromParent();
+    return;
+  }
+
+  unsigned LO16Opc = 0;
+  unsigned HI16Opc = 0;
+  if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
+    LO16Opc = ARM::t2MOVi16;
+    HI16Opc = ARM::t2MOVTi16;
+  } else {
+    LO16Opc = ARM::MOVi16;
+    HI16Opc = ARM::MOVTi16;
+  }
 
-    bool ModifiedOp = true;
-    unsigned Opcode = MI.getOpcode();
-    switch (Opcode) {
+  LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
+  HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg);
+
+  if (MO.isImm()) {
+    unsigned Imm = MO.getImm();
+    unsigned Lo16 = Imm & 0xffff;
+    unsigned Hi16 = (Imm >> 16) & 0xffff;
+    LO16 = LO16.addImm(Lo16);
+    HI16 = HI16.addImm(Hi16);
+  } else {
+    const GlobalValue *GV = MO.getGlobal();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+    HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+  }
+
+  (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  LO16.addImm(Pred).addReg(PredReg);
+  HI16.addImm(Pred).addReg(PredReg);
+
+  TransferImpOps(MI, LO16, HI16);
+  MI.eraseFromParent();
+}
+
+bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
     default:
-      ModifiedOp = false;
-      break;
+      return false;
+    case ARM::Int_eh_sjlj_dispatchsetup: {
+      MachineFunction &MF = *MI.getParent()->getParent();
+      const ARMBaseInstrInfo *AII =
+        static_cast<const ARMBaseInstrInfo*>(TII);
+      const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+      // For functions using a base pointer, we rematerialize it (via the frame
+      // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+      // for us. Otherwise, expand to nothing.
+      if (RI.hasBasePointer(MF)) {
+        int32_t NumBytes = AFI->getFramePtrSpillOffset();
+        unsigned FramePtr = RI.getFrameRegister(MF);
+        assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+               "base pointer without frame pointer?");
+
+        if (AFI->isThumb2Function()) {
+          llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                       FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+        } else if (AFI->isThumbFunction()) {
+          llvm::emitThumbRegPlusImmediate(MBB, MBBI, ARM::R6,
+                                          FramePtr, -NumBytes,
+                                          *TII, RI, MI.getDebugLoc());
+        } else {
+          llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                        FramePtr, -NumBytes, ARMCC::AL, 0,
+                                        *TII);
+        }
+        // If there's dynamic realignment, adjust for it.
+        if (RI.needsStackRealignment(MF)) {
+          MachineFrameInfo  *MFI = MF.getFrameInfo();
+          unsigned MaxAlign = MFI->getMaxAlignment();
+          assert (!AFI->isThumb1OnlyFunction());
+          // Emit bic r6, r6, MaxAlign
+          unsigned bicOpc = AFI->isThumbFunction() ?
+            ARM::t2BICri : ARM::BICri;
+          AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                              TII->get(bicOpc), ARM::R6)
+                                      .addReg(ARM::R6, RegState::Kill)
+                                      .addImm(MaxAlign-1)));
+        }
+
+      }
+      MI.eraseFromParent();
+      return true;
+    }
 
-    case ARM::tLDRpci_pic: 
+    case ARM::MOVsrl_flag:
+    case ARM::MOVsra_flag: {
+      // These are just fancy MOVs insructions.
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+                             MI.getOperand(0).getReg())
+                     .addOperand(MI.getOperand(1))
+                     .addReg(0)
+                     .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr
+                                                  : ARM_AM::asr), 1)))
+        .addReg(ARM::CPSR, RegState::Define);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::RRX: {
+      // This encodes as "MOVs Rd, Rm, rrx
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+                               MI.getOperand(0).getReg())
+                       .addOperand(MI.getOperand(1))
+                       .addOperand(MI.getOperand(1))
+                       .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+        .addReg(0);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::TPsoft: {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(ARM::BL))
+        .addExternalSymbol("__aeabi_read_tp", 0);
+
+      (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tLDRpci_pic:
     case ARM::t2LDRpci_pic: {
       unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
         ? ARM::tLDRpci : ARM::t2LDRpci;
@@ -225,54 +803,73 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
       (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
                                          TII->get(ARM::tPICADD))
-        .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
         .addReg(DstReg)
         .addOperand(MI.getOperand(2));
       TransferImpOps(MI, MIB1, MIB2);
       MI.eraseFromParent();
-      break;
+      return true;
     }
 
-    case ARM::MOVi32imm:
-    case ARM::t2MOVi32imm: {
-      unsigned PredReg = 0;
-      ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg);
+    case ARM::MOV_ga_dyn:
+    case ARM::MOV_ga_pcrel:
+    case ARM::MOV_ga_pcrel_ldr:
+    case ARM::t2MOV_ga_dyn:
+    case ARM::t2MOV_ga_pcrel: {
+      // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
+      unsigned LabelId = AFI->createPICLabelUId();
       unsigned DstReg = MI.getOperand(0).getReg();
       bool DstIsDead = MI.getOperand(0).isDead();
-      const MachineOperand &MO = MI.getOperand(1);
-      MachineInstrBuilder LO16, HI16;
-
-      LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
-                     TII->get(Opcode == ARM::MOVi32imm ?
-                              ARM::MOVi16 : ARM::t2MOVi16),
-                     DstReg);
-      HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
-                     TII->get(Opcode == ARM::MOVi32imm ?
-                              ARM::MOVTi16 : ARM::t2MOVTi16))
-        .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
-        .addReg(DstReg);
-
-      if (MO.isImm()) {
-        unsigned Imm = MO.getImm();
-        unsigned Lo16 = Imm & 0xffff;
-        unsigned Hi16 = (Imm >> 16) & 0xffff;
-        LO16 = LO16.addImm(Lo16);
-        HI16 = HI16.addImm(Hi16);
-      } else {
-        const GlobalValue *GV = MO.getGlobal();
-        unsigned TF = MO.getTargetFlags();
-        LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
-        HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      unsigned TF = MO1.getTargetFlags();
+      bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
+      bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn);
+      unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+      unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel;
+      unsigned LO16TF = isPIC
+        ? ARMII::MO_LO16_NONLAZY_PIC : ARMII::MO_LO16_NONLAZY;
+      unsigned HI16TF = isPIC
+        ? ARMII::MO_HI16_NONLAZY_PIC : ARMII::MO_HI16_NONLAZY;
+      unsigned PICAddOpc = isARM
+        ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+        : ARM::tPICADD;
+      MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(LO16Opc), DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
+        .addImm(LabelId);
+      MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(HI16Opc), DstReg)
+        .addReg(DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
+        .addImm(LabelId);
+      if (!isPIC) {
+        TransferImpOps(MI, MIB1, MIB2);
+        MI.eraseFromParent();
+        return true;
       }
-      (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-      (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-      LO16.addImm(Pred).addReg(PredReg);
-      HI16.addImm(Pred).addReg(PredReg);
-      TransferImpOps(MI, LO16, HI16);
+
+      MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(PICAddOpc))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg).addImm(LabelId);
+      if (isARM) {
+        AddDefaultPred(MIB3);
+        if (Opcode == ARM::MOV_ga_pcrel_ldr)
+          (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      }
+      TransferImpOps(MI, MIB1, MIB3);
       MI.eraseFromParent();
-      break;
+      return true;
     }
 
+    case ARM::MOVi32imm:
+    case ARM::MOVCCi32imm:
+    case ARM::t2MOVi32imm:
+    case ARM::t2MOVCCi32imm:
+      ExpandMOV32BitImm(MBB, MBBI);
+      return true;
+
     case ARM::VMOVQQ: {
       unsigned DstReg = MI.getOperand(0).getReg();
       bool DstIsDead = MI.getOperand(0).isDead();
@@ -285,222 +882,339 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
       MachineInstrBuilder Even =
         AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
                                TII->get(ARM::VMOVQ))
-                     .addReg(EvenDst,
-                             getDefRegState(true) | getDeadRegState(DstIsDead))
-                     .addReg(EvenSrc, getKillRegState(SrcIsKill)));
+                       .addReg(EvenDst,
+                               RegState::Define | getDeadRegState(DstIsDead))
+                       .addReg(EvenSrc, getKillRegState(SrcIsKill)));
       MachineInstrBuilder Odd =
         AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
                                TII->get(ARM::VMOVQ))
-                     .addReg(OddDst,
-                             getDefRegState(true) | getDeadRegState(DstIsDead))
-                     .addReg(OddSrc, getKillRegState(SrcIsKill)));
+                       .addReg(OddDst,
+                               RegState::Define | getDeadRegState(DstIsDead))
+                       .addReg(OddSrc, getKillRegState(SrcIsKill)));
       TransferImpOps(MI, Even, Odd);
       MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VLDMQIA:
+    case ARM::VLDMQDB: {
+      unsigned NewOpc = (Opcode == ARM::VLDMQIA) ? ARM::VLDMDIA : ARM::VLDMDDB;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register destination.
+      bool DstIsDead = MI.getOperand(OpIdx).isDead();
+      unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the source register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the destination operands (D subregs).
+      unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+      MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+
+      // Add an implicit def for the super-register.
+      MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VSTMQIA:
+    case ARM::VSTMQDB: {
+      unsigned NewOpc = (Opcode == ARM::VSTMQIA) ? ARM::VSTMDIA : ARM::VSTMDDB;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register source.
+      bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+      unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the destination register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the source operands (D subregs).
+      unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+      MIB.addReg(D0).addReg(D1);
+
+      if (SrcIsKill)
+        // Add an implicit kill for the Q register.
+        (*MIB).addRegisterKilled(SrcReg, TRI, true);
+
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::VDUPfqf:
+    case ARM::VDUPfdf:{
+      unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLNfq : ARM::VDUPLNfd;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned Lane = getARMRegisterNumbering(SrcReg) & 1;
+      unsigned DReg = TRI->getMatchingSuperReg(SrcReg,
+                                               Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass);
+      // The lane is [0,1] for the containing DReg superregister.
+      // Copy the dst/src register operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addReg(DReg);
+      ++OpIdx;
+      // Add the lane select operand.
+      MIB.addImm(Lane);
+      // Add the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
     }
 
     case ARM::VLD1q8Pseudo:
-      ExpandVLD(MBBI, ARM::VLD1q8, false, SingleSpc, 2); break;
     case ARM::VLD1q16Pseudo:
-      ExpandVLD(MBBI, ARM::VLD1q16, false, SingleSpc, 2); break;
     case ARM::VLD1q32Pseudo:
-      ExpandVLD(MBBI, ARM::VLD1q32, false, SingleSpc, 2); break;
     case ARM::VLD1q64Pseudo:
-      ExpandVLD(MBBI, ARM::VLD1q64, false, SingleSpc, 2); break;
     case ARM::VLD1q8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1q8, true, SingleSpc, 2); break;
     case ARM::VLD1q16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1q16, true, SingleSpc, 2); break;
     case ARM::VLD1q32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1q32, true, SingleSpc, 2); break;
     case ARM::VLD1q64Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1q64, true, SingleSpc, 2); break;
-
     case ARM::VLD2d8Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2d8, false, SingleSpc, 2); break;
     case ARM::VLD2d16Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2d16, false, SingleSpc, 2); break;
     case ARM::VLD2d32Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2d32, false, SingleSpc, 2); break;
     case ARM::VLD2q8Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2q8, false, SingleSpc, 4); break;
     case ARM::VLD2q16Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2q16, false, SingleSpc, 4); break;
     case ARM::VLD2q32Pseudo:
-      ExpandVLD(MBBI, ARM::VLD2q32, false, SingleSpc, 4); break;
     case ARM::VLD2d8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2d8, true, SingleSpc, 2); break;
     case ARM::VLD2d16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2d16, true, SingleSpc, 2); break;
     case ARM::VLD2d32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2d32, true, SingleSpc, 2); break;
     case ARM::VLD2q8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2q8, true, SingleSpc, 4); break;
     case ARM::VLD2q16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2q16, true, SingleSpc, 4); break;
     case ARM::VLD2q32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD2q32, true, SingleSpc, 4); break;
-
     case ARM::VLD3d8Pseudo:
-      ExpandVLD(MBBI, ARM::VLD3d8, false, SingleSpc, 3); break;
     case ARM::VLD3d16Pseudo:
-      ExpandVLD(MBBI, ARM::VLD3d16, false, SingleSpc, 3); break;
     case ARM::VLD3d32Pseudo:
-      ExpandVLD(MBBI, ARM::VLD3d32, false, SingleSpc, 3); break;
     case ARM::VLD1d64TPseudo:
-      ExpandVLD(MBBI, ARM::VLD1d64T, false, SingleSpc, 3); break;
     case ARM::VLD3d8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3d8_UPD, true, SingleSpc, 3); break;
     case ARM::VLD3d16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3d16_UPD, true, SingleSpc, 3); break;
     case ARM::VLD3d32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3d32_UPD, true, SingleSpc, 3); break;
     case ARM::VLD1d64TPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1d64T_UPD, true, SingleSpc, 3); break;
     case ARM::VLD3q8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, EvenDblSpc, 3); break;
     case ARM::VLD3q16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, EvenDblSpc, 3); break;
     case ARM::VLD3q32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, EvenDblSpc, 3); break;
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
     case ARM::VLD3q8oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, OddDblSpc, 3); break;
     case ARM::VLD3q16oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, OddDblSpc, 3); break;
     case ARM::VLD3q32oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, OddDblSpc, 3); break;
-
     case ARM::VLD4d8Pseudo:
-      ExpandVLD(MBBI, ARM::VLD4d8, false, SingleSpc, 4); break;
     case ARM::VLD4d16Pseudo:
-      ExpandVLD(MBBI, ARM::VLD4d16, false, SingleSpc, 4); break;
     case ARM::VLD4d32Pseudo:
-      ExpandVLD(MBBI, ARM::VLD4d32, false, SingleSpc, 4); break;
     case ARM::VLD1d64QPseudo:
-      ExpandVLD(MBBI, ARM::VLD1d64Q, false, SingleSpc, 4); break;
     case ARM::VLD4d8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4d8_UPD, true, SingleSpc, 4); break;
     case ARM::VLD4d16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4d16_UPD, true, SingleSpc, 4); break;
     case ARM::VLD4d32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4d32_UPD, true, SingleSpc, 4); break;
     case ARM::VLD1d64QPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD1d64Q_UPD, true, SingleSpc, 4); break;
     case ARM::VLD4q8Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, EvenDblSpc, 4); break;
     case ARM::VLD4q16Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, EvenDblSpc, 4); break;
     case ARM::VLD4q32Pseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, EvenDblSpc, 4); break;
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
     case ARM::VLD4q8oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, OddDblSpc, 4); break;
     case ARM::VLD4q16oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, OddDblSpc, 4); break;
     case ARM::VLD4q32oddPseudo_UPD:
-      ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, OddDblSpc, 4); break;
+    case ARM::VLD1DUPq8Pseudo:
+    case ARM::VLD1DUPq16Pseudo:
+    case ARM::VLD1DUPq32Pseudo:
+    case ARM::VLD1DUPq8Pseudo_UPD:
+    case ARM::VLD1DUPq16Pseudo_UPD:
+    case ARM::VLD1DUPq32Pseudo_UPD:
+    case ARM::VLD2DUPd8Pseudo:
+    case ARM::VLD2DUPd16Pseudo:
+    case ARM::VLD2DUPd32Pseudo:
+    case ARM::VLD2DUPd8Pseudo_UPD:
+    case ARM::VLD2DUPd16Pseudo_UPD:
+    case ARM::VLD2DUPd32Pseudo_UPD:
+    case ARM::VLD3DUPd8Pseudo:
+    case ARM::VLD3DUPd16Pseudo:
+    case ARM::VLD3DUPd32Pseudo:
+    case ARM::VLD3DUPd8Pseudo_UPD:
+    case ARM::VLD3DUPd16Pseudo_UPD:
+    case ARM::VLD3DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+      ExpandVLD(MBBI);
+      return true;
 
     case ARM::VST1q8Pseudo:
-      ExpandVST(MBBI, ARM::VST1q8, false, SingleSpc, 2); break;
     case ARM::VST1q16Pseudo:
-      ExpandVST(MBBI, ARM::VST1q16, false, SingleSpc, 2); break;
     case ARM::VST1q32Pseudo:
-      ExpandVST(MBBI, ARM::VST1q32, false, SingleSpc, 2); break;
     case ARM::VST1q64Pseudo:
-      ExpandVST(MBBI, ARM::VST1q64, false, SingleSpc, 2); break;
     case ARM::VST1q8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1q8_UPD, true, SingleSpc, 2); break;
     case ARM::VST1q16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1q16_UPD, true, SingleSpc, 2); break;
     case ARM::VST1q32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1q32_UPD, true, SingleSpc, 2); break;
     case ARM::VST1q64Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1q64_UPD, true, SingleSpc, 2); break;
-
     case ARM::VST2d8Pseudo:
-      ExpandVST(MBBI, ARM::VST2d8, false, SingleSpc, 2); break;
     case ARM::VST2d16Pseudo:
-      ExpandVST(MBBI, ARM::VST2d16, false, SingleSpc, 2); break;
     case ARM::VST2d32Pseudo:
-      ExpandVST(MBBI, ARM::VST2d32, false, SingleSpc, 2); break;
     case ARM::VST2q8Pseudo:
-      ExpandVST(MBBI, ARM::VST2q8, false, SingleSpc, 4); break;
     case ARM::VST2q16Pseudo:
-      ExpandVST(MBBI, ARM::VST2q16, false, SingleSpc, 4); break;
     case ARM::VST2q32Pseudo:
-      ExpandVST(MBBI, ARM::VST2q32, false, SingleSpc, 4); break;
     case ARM::VST2d8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2d8_UPD, true, SingleSpc, 2); break;
     case ARM::VST2d16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2d16_UPD, true, SingleSpc, 2); break;
     case ARM::VST2d32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2d32_UPD, true, SingleSpc, 2); break;
     case ARM::VST2q8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2q8_UPD, true, SingleSpc, 4); break;
     case ARM::VST2q16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2q16_UPD, true, SingleSpc, 4); break;
     case ARM::VST2q32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST2q32_UPD, true, SingleSpc, 4); break;
-
     case ARM::VST3d8Pseudo:
-      ExpandVST(MBBI, ARM::VST3d8, false, SingleSpc, 3); break;
     case ARM::VST3d16Pseudo:
-      ExpandVST(MBBI, ARM::VST3d16, false, SingleSpc, 3); break;
     case ARM::VST3d32Pseudo:
-      ExpandVST(MBBI, ARM::VST3d32, false, SingleSpc, 3); break;
     case ARM::VST1d64TPseudo:
-      ExpandVST(MBBI, ARM::VST1d64T, false, SingleSpc, 3); break;
     case ARM::VST3d8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3d8_UPD, true, SingleSpc, 3); break;
     case ARM::VST3d16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3d16_UPD, true, SingleSpc, 3); break;
     case ARM::VST3d32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3d32_UPD, true, SingleSpc, 3); break;
     case ARM::VST1d64TPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1d64T_UPD, true, SingleSpc, 3); break;
     case ARM::VST3q8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q8_UPD, true, EvenDblSpc, 3); break;
     case ARM::VST3q16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q16_UPD, true, EvenDblSpc, 3); break;
     case ARM::VST3q32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q32_UPD, true, EvenDblSpc, 3); break;
+    case ARM::VST3q8oddPseudo:
+    case ARM::VST3q16oddPseudo:
+    case ARM::VST3q32oddPseudo:
     case ARM::VST3q8oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q8_UPD, true, OddDblSpc, 3); break;
     case ARM::VST3q16oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q16_UPD, true, OddDblSpc, 3); break;
     case ARM::VST3q32oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST3q32_UPD, true, OddDblSpc, 3); break;
-
     case ARM::VST4d8Pseudo:
-      ExpandVST(MBBI, ARM::VST4d8, false, SingleSpc, 4); break;
     case ARM::VST4d16Pseudo:
-      ExpandVST(MBBI, ARM::VST4d16, false, SingleSpc, 4); break;
     case ARM::VST4d32Pseudo:
-      ExpandVST(MBBI, ARM::VST4d32, false, SingleSpc, 4); break;
     case ARM::VST1d64QPseudo:
-      ExpandVST(MBBI, ARM::VST1d64Q, false, SingleSpc, 4); break;
     case ARM::VST4d8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4d8_UPD, true, SingleSpc, 4); break;
     case ARM::VST4d16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4d16_UPD, true, SingleSpc, 4); break;
     case ARM::VST4d32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4d32_UPD, true, SingleSpc, 4); break;
     case ARM::VST1d64QPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST1d64Q_UPD, true, SingleSpc, 4); break;
     case ARM::VST4q8Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc, 4); break;
     case ARM::VST4q16Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc, 4); break;
     case ARM::VST4q32Pseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc, 4); break;
+    case ARM::VST4q8oddPseudo:
+    case ARM::VST4q16oddPseudo:
+    case ARM::VST4q32oddPseudo:
     case ARM::VST4q8oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q8_UPD, true, OddDblSpc, 4); break;
     case ARM::VST4q16oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q16_UPD, true, OddDblSpc, 4); break;
     case ARM::VST4q32oddPseudo_UPD:
-      ExpandVST(MBBI, ARM::VST4q32_UPD, true, OddDblSpc, 4); break;
-    }
+      ExpandVST(MBBI);
+      return true;
+
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD3LNd8Pseudo:
+    case ARM::VLD3LNd16Pseudo:
+    case ARM::VLD3LNd32Pseudo:
+    case ARM::VLD3LNq16Pseudo:
+    case ARM::VLD3LNq32Pseudo:
+    case ARM::VLD3LNd8Pseudo_UPD:
+    case ARM::VLD3LNd16Pseudo_UPD:
+    case ARM::VLD3LNd32Pseudo_UPD:
+    case ARM::VLD3LNq16Pseudo_UPD:
+    case ARM::VLD3LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+    case ARM::VST1LNq8Pseudo:
+    case ARM::VST1LNq16Pseudo:
+    case ARM::VST1LNq32Pseudo:
+    case ARM::VST1LNq8Pseudo_UPD:
+    case ARM::VST1LNq16Pseudo_UPD:
+    case ARM::VST1LNq32Pseudo_UPD:
+    case ARM::VST2LNd8Pseudo:
+    case ARM::VST2LNd16Pseudo:
+    case ARM::VST2LNd32Pseudo:
+    case ARM::VST2LNq16Pseudo:
+    case ARM::VST2LNq32Pseudo:
+    case ARM::VST2LNd8Pseudo_UPD:
+    case ARM::VST2LNd16Pseudo_UPD:
+    case ARM::VST2LNd32Pseudo_UPD:
+    case ARM::VST2LNq16Pseudo_UPD:
+    case ARM::VST2LNq32Pseudo_UPD:
+    case ARM::VST3LNd8Pseudo:
+    case ARM::VST3LNd16Pseudo:
+    case ARM::VST3LNd32Pseudo:
+    case ARM::VST3LNq16Pseudo:
+    case ARM::VST3LNq32Pseudo:
+    case ARM::VST3LNd8Pseudo_UPD:
+    case ARM::VST3LNd16Pseudo_UPD:
+    case ARM::VST3LNd32Pseudo_UPD:
+    case ARM::VST3LNq16Pseudo_UPD:
+    case ARM::VST3LNq32Pseudo_UPD:
+    case ARM::VST4LNd8Pseudo:
+    case ARM::VST4LNd16Pseudo:
+    case ARM::VST4LNd32Pseudo:
+    case ARM::VST4LNq16Pseudo:
+    case ARM::VST4LNq32Pseudo:
+    case ARM::VST4LNd8Pseudo_UPD:
+    case ARM::VST4LNd16Pseudo_UPD:
+    case ARM::VST4LNd32Pseudo_UPD:
+    case ARM::VST4LNq16Pseudo_UPD:
+    case ARM::VST4LNq32Pseudo_UPD:
+      ExpandLaneOp(MBBI);
+      return true;
+
+    case ARM::VTBL2Pseudo: ExpandVTBL(MBBI, ARM::VTBL2, false, 2); return true;
+    case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false, 3); return true;
+    case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false, 4); return true;
+    case ARM::VTBX2Pseudo: ExpandVTBL(MBBI, ARM::VTBX2, true, 2); return true;
+    case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true, 3); return true;
+    case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true, 4); return true;
+  }
+
+  return false;
+}
 
-    if (ModifiedOp)
-      Modified = true;
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+    Modified |= ExpandMI(MBB, MBBI);
     MBBI = NMBBI;
   }
 
@@ -508,8 +1222,11 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
 }
 
 bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  AFI = MF.getInfo<ARMFunctionInfo>();
 
   bool Modified = false;
   for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
index 4892eae..9f29530 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -15,14 +15,17 @@
 
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
+#include "ARMCallingConv.h"
 #include "ARMRegisterInfo.h"
 #include "ARMTargetMachine.h"
 #include "ARMSubtarget.h"
+#include "ARMConstantPoolValue.h"
 #include "llvm/CallingConv.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -30,7 +33,9 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -43,12 +48,37 @@
 using namespace llvm;
 
 static cl::opt<bool>
-EnableARMFastISel("arm-fast-isel",
-                  cl::desc("Turn on experimental ARM fast-isel support"),
-                  cl::init(false), cl::Hidden);
+DisableARMFastISel("disable-arm-fast-isel",
+                    cl::desc("Turn off experimental ARM fast-isel support"),
+                    cl::init(false), cl::Hidden);
+
+extern cl::opt<bool> EnableARMLongCalls;
 
 namespace {
 
+  // All possible address modes, plus some.
+  typedef struct Address {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int Offset;
+    unsigned Scale;
+    unsigned PlusReg;
+
+    // Innocuous defaults for our address.
+    Address()
+     : BaseType(RegBase), Offset(0), Scale(0), PlusReg(0) {
+       Base.Reg = 0;
+     }
+  } Address;
+
 class ARMFastISel : public FastISel {
 
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
@@ -57,13 +87,14 @@ class ARMFastISel : public FastISel {
   const TargetMachine &TM;
   const TargetInstrInfo &TII;
   const TargetLowering &TLI;
-  const ARMFunctionInfo *AFI;
+  ARMFunctionInfo *AFI;
 
-  // Convenience variable to avoid checking all the time.
+  // Convenience variables to avoid some queries.
   bool isThumb;
+  LLVMContext *Context;
 
   public:
-    explicit ARMFastISel(FunctionLoweringInfo &funcInfo) 
+    explicit ARMFastISel(FunctionLoweringInfo &funcInfo)
     : FastISel(funcInfo),
       TM(funcInfo.MF->getTarget()),
       TII(*TM.getInstrInfo()),
@@ -71,6 +102,7 @@ class ARMFastISel : public FastISel {
       Subtarget = &TM.getSubtarget<ARMSubtarget>();
       AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
       isThumb = AFI->isThumbFunction();
+      Context = &funcInfo.Fn->getContext();
     }
 
     // Code from FastISel.cpp.
@@ -102,36 +134,73 @@ class ARMFastISel : public FastISel {
     virtual unsigned FastEmitInst_extractsubreg(MVT RetVT,
                                                 unsigned Op0, bool Op0IsKill,
                                                 uint32_t Idx);
-                                                
+
     // Backend specific FastISel code.
     virtual bool TargetSelectInstruction(const Instruction *I);
     virtual unsigned TargetMaterializeConstant(const Constant *C);
+    virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
 
   #include "ARMGenFastISel.inc"
-  
+
     // Instruction selection routines.
-    virtual bool ARMSelectLoad(const Instruction *I);
-    virtual bool ARMSelectStore(const Instruction *I);
-    virtual bool ARMSelectBranch(const Instruction *I);
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectBinaryOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectSIToFP(const Instruction *I);
+    bool SelectFPToSI(const Instruction *I);
+    bool SelectSDiv(const Instruction *I);
+    bool SelectSRem(const Instruction *I);
+    bool SelectCall(const Instruction *I);
+    bool SelectSelect(const Instruction *I);
+    bool SelectRet(const Instruction *I);
 
     // Utility routines.
   private:
-    bool isTypeLegal(const Type *Ty, EVT &VT);
-    bool isLoadTypeLegal(const Type *Ty, EVT &VT);
-    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, unsigned Reg, int Offset);
-    bool ARMEmitStore(EVT VT, unsigned SrcReg, unsigned Reg, int Offset);
-    bool ARMLoadAlloca(const Instruction *I);
-    bool ARMStoreAlloca(const Instruction *I, unsigned SrcReg);
-    bool ARMComputeRegOffset(const Value *Obj, unsigned &Reg, int &Offset);
-    bool ARMMaterializeConstant(const ConstantInt *Val, unsigned &Reg);
-    
+    bool isTypeLegal(const Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(const Type *Ty, MVT &VT);
+    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr);
+    bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr);
+    bool ARMComputeAddress(const Value *Obj, Address &Addr);
+    void ARMSimplifyAddress(Address &Addr, EVT VT);
+    unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT);
+    unsigned ARMMaterializeInt(const Constant *C, EVT VT);
+    unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT);
+    unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg);
+    unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg);
+
+    // Call handling routines.
+  private:
+    bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                        unsigned &ResultReg);
+    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return);
+    bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes);
+    bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes);
+    bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
+
+    // OptionalDef handling routines.
+  private:
     bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
     const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
+    void AddLoadStoreOperands(EVT VT, Address &Addr,
+                              const MachineInstrBuilder &MIB);
 };
 
 } // end anonymous namespace
 
-// #include "ARMGenCallingConv.inc"
+#include "ARMGenCallingConv.inc"
 
 // DefinesOptionalPredicate - This is different from DefinesPredicate in that
 // we don't care about implicit defs here, just places we'll need to add a
@@ -153,6 +222,9 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
 
 // If the machine is predicable go ahead and add the predicate operands, if
 // it needs default CC operands add those.
+// TODO: If we want to support thumb1 then we'll need to deal with optional
+// CPSR defs that need to be added before the remaining operands. See s_cc_out
+// for descriptions why.
 const MachineInstrBuilder &
 ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   MachineInstr *MI = &*MIB;
@@ -160,7 +232,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   // Do we use a predicate?
   if (TII.isPredicable(MI))
     AddDefaultPred(MIB);
-  
+
   // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
   // defines CPSR. All other OptionalDefines in ARM are the CCR register.
   bool CPSR = false;
@@ -297,7 +369,7 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
                                      uint64_t Imm) {
   unsigned ResultReg = createResultReg(RC);
   const TargetInstrDesc &II = TII.get(MachineInstOpcode);
-  
+
   if (II.getNumDefs() >= 1)
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addImm(Imm));
@@ -323,16 +395,84 @@ unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT,
   return ResultReg;
 }
 
-unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
-  EVT VT = TLI.getValueType(C->getType(), true);
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
+unsigned ARMFastISel::ARMMoveToFPReg(EVT VT, unsigned SrcReg) {
+  if (VT == MVT::f64) return 0;
 
-  // Only handle simple types.
-  if (!VT.isSimple()) return 0;
-  
-  // TODO: This should be safe for fp because they're just bits from the
-  // Constant.
-  // TODO: Theoretically we could materialize fp constants with instructions
-  // from VFP3.
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VMOVRS), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+unsigned ARMFastISel::ARMMoveToIntReg(EVT VT, unsigned SrcReg) {
+  if (VT == MVT::i64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VMOVSR), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+// For double width floating point we need to materialize two constants
+// (the high and the low) into integer registers then use a move to get
+// the combined constant into an FP reg.
+unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) {
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = VT == MVT::f64;
+
+  // This checks to see if we can use VFP3 instructions to materialize
+  // a constant, otherwise we have to go through the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    unsigned Opc = is64bit ? ARM::FCONSTD : ARM::FCONSTS;
+    unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                            DestReg)
+                    .addFPImm(CFP));
+    return DestReg;
+  }
+
+  // Require VFP2 for loading fp constants.
+  if (!Subtarget->hasVFP2()) return false;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = TD.getTypeAllocSize(CFP->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
+
+  // The extra reg is for addrmode5.
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          DestReg)
+                  .addConstantPoolIndex(Idx)
+                  .addReg(0));
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) {
+
+  // For now 32-bit only.
+  if (VT != MVT::i32) return false;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+
+  // If we can do this in a single instruction without a constant pool entry
+  // do so now.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getSExtValue())) {
+    unsigned Opc = isThumb ? ARM::t2MOVi16 : ARM::MOVi16;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), DestReg)
+                    .addImm(CI->getSExtValue()));
+    return DestReg;
+  }
 
   // MachineConstantPool wants an explicit alignment.
   unsigned Align = TD.getPrefTypeAlignment(C->getType());
@@ -342,58 +482,144 @@ unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
   }
   unsigned Idx = MCP.getConstantPoolIndex(C, Align);
 
-  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
-  // Different addressing modes between ARM/Thumb2 for constant pool loads.
   if (isThumb)
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(ARM::t2LDRpci))
-                    .addReg(DestReg).addConstantPoolIndex(Idx));
+                            TII.get(ARM::t2LDRpci), DestReg)
+                    .addConstantPoolIndex(Idx));
   else
+    // The extra immediate is for addrmode2.
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(ARM::LDRcp))
-                    .addReg(DestReg).addConstantPoolIndex(Idx)
-                    .addReg(0).addImm(0));
-    
+                            TII.get(ARM::LDRcp), DestReg)
+                    .addConstantPoolIndex(Idx)
+                    .addImm(0));
+
   return DestReg;
 }
 
-bool ARMFastISel::isTypeLegal(const Type *Ty, EVT &VT) {
-  VT = TLI.getValueType(Ty, true);
-  
+unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32) return 0;
+
+  Reloc::Model RelocM = TM.getRelocationModel();
+
+  // TODO: No external globals for now.
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) return 0;
+
+  // TODO: Need more magic for ARM PIC.
+  if (!isThumb && (RelocM == Reloc::PIC_)) return 0;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(GV->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = TD.getTypeAllocSize(GV->getType());
+  }
+
+  // Grab index.
+  unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8);
+  unsigned Id = AFI->createPICLabelUId();
+  ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, Id,
+                                                       ARMCP::CPValue, PCAdj);
+  unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+  // Load value.
+  MachineInstrBuilder MIB;
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  if (isThumb) {
+    unsigned Opc = (RelocM != Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+          .addConstantPoolIndex(Idx);
+    if (RelocM == Reloc::PIC_)
+      MIB.addImm(Id);
+  } else {
+    // The extra immediate is for addrmode2.
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
+                  DestReg)
+          .addConstantPoolIndex(Idx)
+          .addImm(0);
+  }
+  AddOptionalDefs(MIB);
+  return DestReg;
+}
+
+unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT VT = TLI.getValueType(C->getType(), true);
+
   // Only handle simple types.
-  if (VT == MVT::Other || !VT.isSimple()) return false;
-    
+  if (!VT.isSimple()) return 0;
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return false;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  // This will get lowered later into the correct offsets and registers
+  // via rewriteXFrameIndex.
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    TargetRegisterClass* RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(SI->second)
+                            .addImm(0));
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple()) return false;
+  VT = evt.getSimpleVT();
+
   // Handle all legal types, i.e. a register that will directly hold this
   // value.
   return TLI.isTypeLegal(VT);
 }
 
-bool ARMFastISel::isLoadTypeLegal(const Type *Ty, EVT &VT) {
+bool ARMFastISel::isLoadTypeLegal(const Type *Ty, MVT &VT) {
   if (isTypeLegal(Ty, VT)) return true;
-  
+
   // If this is a type than can be sign or zero-extended to a basic operation
   // go ahead and accept it now.
   if (VT == MVT::i8 || VT == MVT::i16)
     return true;
-  
+
   return false;
 }
 
-// Computes the Reg+Offset to get to an object.
-bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Reg,
-                                      int &Offset) {
+// Computes the address to get to an object.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
   // Some boilerplate from the X86 FastISel.
   const User *U = NULL;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
-    // Don't walk into other basic blocks; it's possible we haven't
-    // visited them yet, so the instructions may not yet be assigned
-    // virtual registers.
-    if (FuncInfo.MBBMap[I->getParent()] != FuncInfo.MBB)
-      return false;
-
-    Opcode = I->getOpcode();
-    U = I;
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
     Opcode = C->getOpcode();
     U = C;
@@ -404,141 +630,282 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Reg,
       // Fast instruction selection doesn't support the special
       // address spaces.
       return false;
-  
+
   switch (Opcode) {
-    default: 
-    //errs() << "Failing Opcode is: " << *Op1 << "\n";
+    default:
     break;
+    case Instruction::BitCast: {
+      // Look through bitcasts.
+      return ARMComputeAddress(U->getOperand(0), Addr);
+    }
+    case Instruction::IntToPtr: {
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    }
+    case Instruction::PtrToInt: {
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    }
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      int TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+           i != e; ++i, ++GTI) {
+        const Value *Op = *i;
+        if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = TD.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+          SmallVector<const Value *, 4> Worklist;
+          Worklist.push_back(Op);
+          do {
+            Op = Worklist.pop_back_val();
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+            } else if (isa<AddOperator>(Op) &&
+                       isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+              // An add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Add the other operand back to the work list.
+              Worklist.push_back(cast<AddOperator>(Op)->getOperand(0));
+            } else
+              goto unsupported_gep;
+          } while (!Worklist.empty());
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (ARMComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
     case Instruction::Alloca: {
-      assert(false && "Alloca should have been handled earlier!");
-      return false;
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
     }
   }
-  
+
+  // Materialize the global variable's address into a reg which can
+  // then be used later to load the variable.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
-    //errs() << "Failing GV is: " << GV << "\n";
-    (void)GV;
-    return false;
+    unsigned Tmp = ARMMaterializeGV(GV, TLI.getValueType(Obj->getType()));
+    if (Tmp == 0) return false;
+
+    Addr.Base.Reg = Tmp;
+    return true;
   }
-  
+
   // Try to get this in a register if nothing else has worked.
-  Reg = getRegForValue(Obj);
-  if (Reg == 0) return false;
+  if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) {
 
-  // Since the offset may be too large for the load instruction
+  assert(VT.isSimple() && "Non-simple types are invalid here!");
+
+  bool needsLowering = false;
+  switch (VT.getSimpleVT().SimpleTy) {
+    default:
+      assert(false && "Unhandled load/store type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      // Integer loads/stores handle 12-bit offsets.
+      needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      // Floating point operands handle 8-bit offsets.
+      needsLowering = ((Addr.Offset & 0xff) != Addr.Offset);
+      break;
+  }
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
+    TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass :
+                              ARM::GPRRegisterClass;
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(Addr.Base.FI)
+                            .addImm(0));
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  // Since the offset is too large for the load/store instruction
   // get the reg+offset into a register.
-  // TODO: Verify the additions work, otherwise we'll need to add the
-  // offset instead of 0 to the instructions and do all sorts of operand
-  // munging.
-  // TODO: Optimize this somewhat.
-  if (Offset != 0) {
+  if (needsLowering) {
     ARMCC::CondCodes Pred = ARMCC::AL;
     unsigned PredReg = 0;
 
+    TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass :
+      ARM::GPRRegisterClass;
+    unsigned BaseReg = createResultReg(RC);
+
     if (!isThumb)
       emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                              Reg, Reg, Offset, Pred, PredReg,
+                              BaseReg, Addr.Base.Reg, Addr.Offset,
+                              Pred, PredReg,
                               static_cast<const ARMBaseInstrInfo&>(TII));
     else {
       assert(AFI->isThumb2Function());
       emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                             Reg, Reg, Offset, Pred, PredReg,
+                             BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg,
                              static_cast<const ARMBaseInstrInfo&>(TII));
     }
+    Addr.Offset = 0;
+    Addr.Base.Reg = BaseReg;
   }
-  
-  return true;
 }
 
-bool ARMFastISel::ARMLoadAlloca(const Instruction *I) {
-  Value *Op0 = I->getOperand(0);
+void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
+                                       const MachineInstrBuilder &MIB) {
+  // addrmode5 output depends on the selection dag addressing dividing the
+  // offset by 4 that it then later multiplies. Do this here as well.
+  if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
+      VT.getSimpleVT().SimpleTy == MVT::f64)
+    Addr.Offset /= 4;
+    
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    int FI = Addr.Base.FI;
+    int Offset = Addr.Offset;
+    MachineMemOperand *MMO =
+          FuncInfo.MF->getMachineMemOperand(
+                                  MachinePointerInfo::getFixedStack(FI, Offset),
+                                  MachineMemOperand::MOLoad,
+                                  MFI.getObjectSize(FI),
+                                  MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI);
 
-  // Verify it's an alloca.
-  if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op0)) {
-    DenseMap<const AllocaInst*, int>::iterator SI =
-      FuncInfo.StaticAllocaMap.find(AI);
-
-    if (SI != FuncInfo.StaticAllocaMap.end()) {
-      TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
-      unsigned ResultReg = createResultReg(RC);
-      TII.loadRegFromStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt,
-                               ResultReg, SI->second, RC,
-                               TM.getRegisterInfo());
-      UpdateValueMap(I, ResultReg);
-      return true;
-    }
+    // ARM halfword load/stores need an additional operand.
+    if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0);
+
+    MIB.addImm(Addr.Offset);
+    MIB.addMemOperand(MMO);
+  } else {
+    // Now add the rest of the operands.
+    MIB.addReg(Addr.Base.Reg);
+  
+    // ARM halfword load/stores need an additional operand.
+    if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0);
+
+    MIB.addImm(Addr.Offset);
   }
-  return false;
+  AddOptionalDefs(MIB);
 }
 
-bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg,
-                              unsigned Reg, int Offset) {
-  
+bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) {
+
   assert(VT.isSimple() && "Non-simple types are invalid here!");
   unsigned Opc;
-  
+  TargetRegisterClass *RC;
   switch (VT.getSimpleVT().SimpleTy) {
-    default: 
-      assert(false && "Trying to emit for an unhandled type!");
-      return false;
+    // This is mostly going to be Neon/vector support.
+    default: return false;
     case MVT::i16:
-      Opc = isThumb ? ARM::tLDRH : ARM::LDRH;
-      VT = MVT::i32;
+      Opc = isThumb ? ARM::t2LDRHi12 : ARM::LDRH;
+      RC = ARM::GPRRegisterClass;
       break;
     case MVT::i8:
-      Opc = isThumb ? ARM::tLDRB : ARM::LDRB;
-      VT = MVT::i32;
+      Opc = isThumb ? ARM::t2LDRBi12 : ARM::LDRBi12;
+      RC = ARM::GPRRegisterClass;
       break;
     case MVT::i32:
-      Opc = isThumb ? ARM::tLDR : ARM::LDR;
+      Opc = isThumb ? ARM::t2LDRi12 : ARM::LDRi12;
+      RC = ARM::GPRRegisterClass;
+      break;
+    case MVT::f32:
+      Opc = ARM::VLDRS;
+      RC = TLI.getRegClassFor(VT);
+      break;
+    case MVT::f64:
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
       break;
   }
-  
-  ResultReg = createResultReg(TLI.getRegClassFor(VT));
-  
-  // TODO: Fix the Addressing modes so that these can share some code.
-  // Since this is a Thumb1 load this will work in Thumb1 or 2 mode.
-  if (isThumb)
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(Opc), ResultReg)
-                    .addReg(Reg).addImm(Offset).addReg(0));
-  else
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(Opc), ResultReg)
-                    .addReg(Reg).addReg(0).addImm(Offset));
-                    
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT);
+
+  // Create the base instruction, then add the operands.
+  ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB);
   return true;
 }
 
-bool ARMFastISel::ARMStoreAlloca(const Instruction *I, unsigned SrcReg) {
-  Value *Op1 = I->getOperand(1);
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
 
-  // Verify it's an alloca.
-  if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) {
-    DenseMap<const AllocaInst*, int>::iterator SI =
-      FuncInfo.StaticAllocaMap.find(AI);
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
 
-    if (SI != FuncInfo.StaticAllocaMap.end()) {
-      TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
-      assert(SrcReg != 0 && "Nothing to store!");
-      TII.storeRegToStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt,
-                              SrcReg, true /*isKill*/, SI->second, RC,
-                              TM.getRegisterInfo());
-      return true;
-    }
-  }
-  return false;
+  unsigned ResultReg;
+  if (!ARMEmitLoad(VT, ResultReg, Addr)) return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
 }
 
-bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg,
-                               unsigned DstReg, int Offset) {
+bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
   unsigned StrOpc;
   switch (VT.getSimpleVT().SimpleTy) {
+    // This is mostly going to be Neon/vector support.
     default: return false;
-    case MVT::i1:
-    case MVT::i8: StrOpc = isThumb ? ARM::tSTRB : ARM::STRB; break;
-    case MVT::i16: StrOpc = isThumb ? ARM::tSTRH : ARM::STRH; break;
-    case MVT::i32: StrOpc = isThumb ? ARM::tSTR : ARM::STR; break;
+    case MVT::i1: {
+      unsigned Res = createResultReg(isThumb ? ARM::tGPRRegisterClass :
+                                               ARM::GPRRegisterClass);
+      unsigned Opc = isThumb ? ARM::t2ANDri : ARM::ANDri;
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(Opc), Res)
+                      .addReg(SrcReg).addImm(1));
+      SrcReg = Res;
+    } // Fallthrough here.
+    case MVT::i8:
+      StrOpc = isThumb ? ARM::t2STRBi12 : ARM::STRBi12;
+      break;
+    case MVT::i16:
+      StrOpc = isThumb ? ARM::t2STRHi12 : ARM::STRH;
+      break;
+    case MVT::i32:
+      StrOpc = isThumb ? ARM::t2STRi12 : ARM::STRi12;
+      break;
     case MVT::f32:
       if (!Subtarget->hasVFP2()) return false;
       StrOpc = ARM::VSTRS;
@@ -548,91 +915,162 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg,
       StrOpc = ARM::VSTRD;
       break;
   }
-  
-  if (isThumb)
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(StrOpc), SrcReg)
-                    .addReg(DstReg).addImm(Offset).addReg(0));
-  else
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(StrOpc), SrcReg)
-                    .addReg(DstReg).addReg(0).addImm(Offset));
-  
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT);
+
+  // Create the base instruction, then add the operands.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(StrOpc))
+                            .addReg(SrcReg, getKillRegState(true));
+  AddLoadStoreOperands(VT, Addr, MIB);
   return true;
 }
 
-bool ARMFastISel::ARMSelectStore(const Instruction *I) {
+bool ARMFastISel::SelectStore(const Instruction *I) {
   Value *Op0 = I->getOperand(0);
   unsigned SrcReg = 0;
 
-  // Yay type legalization
-  EVT VT;
+  // Verify we have a legal type before going any further.
+  MVT VT;
   if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
     return false;
 
   // Get the value to be stored into a register.
   SrcReg = getRegForValue(Op0);
-  if (SrcReg == 0)
-    return false;
-    
-  // If we're an alloca we know we have a frame index and can emit the store
-  // quickly.
-  if (ARMStoreAlloca(I, SrcReg))
-    return true;
-    
-  // Our register and offset with innocuous defaults.
-  unsigned Reg = 0;
-  int Offset = 0;
-  
-  // See if we can handle this as Reg + Offset
-  if (!ARMComputeRegOffset(I->getOperand(1), Reg, Offset))
-    return false;
-    
-  if (!ARMEmitStore(VT, SrcReg, Reg, Offset /* 0 */)) return false;
-    
-  return false;
-  
-}
+  if (SrcReg == 0) return false;
 
-bool ARMFastISel::ARMSelectLoad(const Instruction *I) {
-  // If we're an alloca we know we have a frame index and can emit the load
-  // directly in short order.
-  if (ARMLoadAlloca(I))
-    return true;
-    
-  // Verify we have a legal type before going any further.
-  EVT VT;
-  if (!isLoadTypeLegal(I->getType(), VT))
-    return false;
-  
-  // Our register and offset with innocuous defaults.
-  unsigned Reg = 0;
-  int Offset = 0;
-  
-  // See if we can handle this as Reg + Offset
-  if (!ARMComputeRegOffset(I->getOperand(0), Reg, Offset))
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(1), Addr))
     return false;
-  
-  unsigned ResultReg;
-  if (!ARMEmitLoad(VT, ResultReg, Reg, Offset /* 0 */)) return false;
-  
-  UpdateValueMap(I, ResultReg);
+
+  if (!ARMEmitStore(VT, SrcReg, Addr)) return false;
   return true;
 }
 
-bool ARMFastISel::ARMSelectBranch(const Instruction *I) {
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // Needs two compares...
+    case CmpInst::FCMP_ONE:
+    case CmpInst::FCMP_UEQ:
+    default:
+      // AL is our "false" for now. The other two need more compares.
+      return ARMCC::AL;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::FCMP_OEQ:
+      return ARMCC::EQ;
+    case CmpInst::ICMP_SGT:
+    case CmpInst::FCMP_OGT:
+      return ARMCC::GT;
+    case CmpInst::ICMP_SGE:
+    case CmpInst::FCMP_OGE:
+      return ARMCC::GE;
+    case CmpInst::ICMP_UGT:
+    case CmpInst::FCMP_UGT:
+      return ARMCC::HI;
+    case CmpInst::FCMP_OLT:
+      return ARMCC::MI;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::FCMP_OLE:
+      return ARMCC::LS;
+    case CmpInst::FCMP_ORD:
+      return ARMCC::VC;
+    case CmpInst::FCMP_UNO:
+      return ARMCC::VS;
+    case CmpInst::FCMP_UGE:
+      return ARMCC::PL;
+    case CmpInst::ICMP_SLT:
+    case CmpInst::FCMP_ULT:
+      return ARMCC::LT;
+    case CmpInst::ICMP_SLE:
+    case CmpInst::FCMP_ULE:
+      return ARMCC::LE;
+    case CmpInst::FCMP_UNE:
+    case CmpInst::ICMP_NE:
+      return ARMCC::NE;
+    case CmpInst::ICMP_UGE:
+      return ARMCC::HS;
+    case CmpInst::ICMP_ULT:
+      return ARMCC::LO;
+  }
+}
+
+bool ARMFastISel::SelectBranch(const Instruction *I) {
   const BranchInst *BI = cast<BranchInst>(I);
   MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
   MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
-  
+
   // Simple branch support.
-  unsigned CondReg = getRegForValue(BI->getCondition());
-  if (CondReg == 0) return false;
-  
-  unsigned CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr;
-  unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
+
+  // If we can, avoid recomputing the compare - redoing it could lead to wonky
+  // behavior.
+  // TODO: Factor this out.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+      MVT VT;
+      const Type *Ty = CI->getOperand(0)->getType();
+      if (!isTypeLegal(Ty, VT))
+        return false;
+
+      bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+      if (isFloat && !Subtarget->hasVFP2())
+        return false;
+
+      unsigned CmpOpc;
+      switch (VT.SimpleTy) {
+        default: return false;
+        // TODO: Verify compares.
+        case MVT::f32:
+          CmpOpc = ARM::VCMPES;
+          break;
+        case MVT::f64:
+          CmpOpc = ARM::VCMPED;
+          break;
+        case MVT::i32:
+          CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr;
+          break;
+      }
+
+      // Get the compare predicate.
+      ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+      // We may not handle every CC for now.
+      if (ARMPred == ARMCC::AL) return false;
+
+      unsigned Arg1 = getRegForValue(CI->getOperand(0));
+      if (Arg1 == 0) return false;
+
+      unsigned Arg2 = getRegForValue(CI->getOperand(1));
+      if (Arg2 == 0) return false;
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(CmpOpc))
+                      .addReg(Arg1).addReg(Arg2));
+
+      // For floating point we need to move the result to a comparison register
+      // that we can then use for branches.
+      if (isFloat)
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                TII.get(ARM::FMSTAT)));
+
+      unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+      .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
+      FastEmitBranch(FBB, DL);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // Re-set the flags just in case.
+  unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri;
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
-                  .addReg(CondReg).addReg(CondReg));
+                  .addReg(CmpReg).addImm(0));
+
+  unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
                   .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
   FastEmitBranch(FBB, DL);
@@ -640,18 +1078,809 @@ bool ARMFastISel::ARMSelectBranch(const Instruction *I) {
   return true;
 }
 
+bool ARMFastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  MVT VT;
+  const Type *Ty = CI->getOperand(0)->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned CmpOpc;
+  unsigned CondReg;
+  switch (VT.SimpleTy) {
+    default: return false;
+    // TODO: Verify compares.
+    case MVT::f32:
+      CmpOpc = ARM::VCMPES;
+      CondReg = ARM::FPSCR;
+      break;
+    case MVT::f64:
+      CmpOpc = ARM::VCMPED;
+      CondReg = ARM::FPSCR;
+      break;
+    case MVT::i32:
+      CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr;
+      CondReg = ARM::CPSR;
+      break;
+  }
+
+  // Get the compare predicate.
+  ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+  // We may not handle every CC for now.
+  if (ARMPred == ARMCC::AL) return false;
+
+  unsigned Arg1 = getRegForValue(CI->getOperand(0));
+  if (Arg1 == 0) return false;
+
+  unsigned Arg2 = getRegForValue(CI->getOperand(1));
+  if (Arg2 == 0) return false;
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+                  .addReg(Arg1).addReg(Arg2));
+
+  // For floating point we need to move the result to a comparison register
+  // that we can then use for branches.
+  if (isFloat)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::FMSTAT)));
+
+  // Now set a register based on the comparison. Explicitly set the predicates
+  // here.
+  unsigned MovCCOpc = isThumb ? ARM::t2MOVCCi : ARM::MOVCCi;
+  TargetRegisterClass *RC = isThumb ? ARM::rGPRRegisterClass
+                                    : ARM::GPRRegisterClass;
+  unsigned DestReg = createResultReg(RC);
+  Constant *Zero
+    = ConstantInt::get(Type::getInt32Ty(*Context), 0);
+  unsigned ZeroReg = TargetMaterializeConstant(Zero);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg)
+          .addReg(ZeroReg).addImm(1)
+          .addImm(ARMPred).addReg(CondReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPExt(const Instruction *I) {
+  // Make sure we have VFP and that we're extending float to double.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() ||
+      !V->getType()->isFloatTy()) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(ARM::DPRRegisterClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VCVTDS), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
+  // Make sure we have VFP and that we're truncating double to float.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!(I->getType()->isFloatTy() &&
+        V->getType()->isDoubleTy())) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(ARM::SPRRegisterClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VCVTSD), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectSIToFP(const Instruction *I) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  const Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, DstVT))
+    return false;
+
+  unsigned Op = getRegForValue(I->getOperand(0));
+  if (Op == 0) return false;
+
+  // The conversion routine works on fp-reg to fp-reg and the operand above
+  // was an integer, move it to the fp registers if possible.
+  unsigned FP = ARMMoveToFPReg(MVT::f32, Op);
+  if (FP == 0) return false;
+
+  unsigned Opc;
+  if (Ty->isFloatTy()) Opc = ARM::VSITOS;
+  else if (Ty->isDoubleTy()) Opc = ARM::VSITOD;
+  else return 0;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          ResultReg)
+                  .addReg(FP));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPToSI(const Instruction *I) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  const Type *RetTy = I->getType();
+  if (!isTypeLegal(RetTy, DstVT))
+    return false;
+
+  unsigned Op = getRegForValue(I->getOperand(0));
+  if (Op == 0) return false;
+
+  unsigned Opc;
+  const Type *OpTy = I->getOperand(0)->getType();
+  if (OpTy->isFloatTy()) Opc = ARM::VTOSIZS;
+  else if (OpTy->isDoubleTy()) Opc = ARM::VTOSIZD;
+  else return 0;
+
+  // f64->s32 or f32->s32 both need an intermediate f32 reg.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          ResultReg)
+                  .addReg(Op));
+
+  // This result needs to be in an integer register, but the conversion only
+  // takes place in fp-regs.
+  unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg);
+  if (IntReg == 0) return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSelect(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // Things need to be register sized for register moves.
+  if (VT != MVT::i32) return false;
+  const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+
+  unsigned CondReg = getRegForValue(I->getOperand(0));
+  if (CondReg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  unsigned Op2Reg = getRegForValue(I->getOperand(2));
+  if (Op2Reg == 0) return false;
+
+  unsigned CmpOpc = isThumb ? ARM::t2TSTri : ARM::TSTri;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+                  .addReg(CondReg).addImm(1));
+  unsigned ResultReg = createResultReg(RC);
+  unsigned MovCCOpc = isThumb ? ARM::t2MOVCCr : ARM::MOVCCr;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
+    .addReg(Op1Reg).addReg(Op2Reg)
+    .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSDiv(const Instruction *I) {
+  MVT VT;
+  const Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // If we have integer div support we should have selected this automagically.
+  // In case we have a real miss go ahead and return false and we'll pick
+  // it up later.
+  if (Subtarget->hasDivide()) return false;
+
+  // Otherwise emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = RTLIB::SDIV_I8;
+  else if (VT == MVT::i16)
+    LC = RTLIB::SDIV_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SDIV_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SDIV_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectSRem(const Instruction *I) {
+  MVT VT;
+  const Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = RTLIB::SREM_I8;
+  else if (VT == MVT::i16)
+    LC = RTLIB::SREM_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SREM_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SREM_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT VT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  const Type *Ty = I->getType();
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
+  unsigned Opc;
+  bool is64bit = VT == MVT::f64 || VT == MVT::i64;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::FADD:
+      Opc = is64bit ? ARM::VADDD : ARM::VADDS;
+      break;
+    case ISD::FSUB:
+      Opc = is64bit ? ARM::VSUBD : ARM::VSUBS;
+      break;
+    case ISD::FMUL:
+      Opc = is64bit ? ARM::VMULD : ARM::VMULS;
+      break;
+  }
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Op1).addReg(Op2));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Call Handling Code
+
+bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src,
+                                 EVT SrcVT, unsigned &ResultReg) {
+  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+                           Src, /*TODO: Kill=*/false);
+
+  if (RR != 0) {
+    ResultReg = RR;
+    return true;
+  } else
+    return false;
+}
+
+// This is largely taken directly from CCAssignFnForNode - we don't support
+// varargs in FastISel so that part has been removed.
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    // Ignore fastcc. Silence compiler warnings.
+    (void)RetFastCC_ARM_APCS;
+    (void)FastCC_ARM_APCS;
+    // Fallthrough
+  case CallingConv::C:
+    // Use target triple & subtarget features to do actual dispatch.
+    if (Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() &&
+          FloatABIType == FloatABI::Hard)
+        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+      else
+        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    } else
+        return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(AdjStackDown))
+                  .addImm(NumBytes));
+
+  // Process the args.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // We don't handle NEON/vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    // Handle arg promotion, etc.
+    switch (VA.getLocInfo()) {
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt: {
+        bool Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+        Emitted = true;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::ZExt: {
+        bool Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+        Emitted = true;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::AExt: {
+        bool Emitted = FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        if (!Emitted)
+          Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                      Arg, ArgVT, Arg);
+        if (!Emitted)
+          Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                      Arg, ArgVT, Arg);
+
+        assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::BCvt: {
+        unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+                                 /*TODO: Kill=*/false);
+        assert(BC != 0 && "Failed to emit a bitcast!");
+        Arg = BC;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      default: llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              VA.getLocReg())
+      .addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64) return false;
+
+      CCValAssign &NextVA = ArgLocs[++i];
+
+      // TODO: Only handle register args for now.
+      if(!(VA.isRegLoc() && NextVA.isRegLoc())) return false;
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(ARM::VMOVRRD), VA.getLocReg())
+                      .addReg(NextVA.getLocReg(), RegState::Define)
+                      .addReg(Arg));
+      RegArgs.push_back(VA.getLocReg());
+      RegArgs.push_back(NextVA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+      // Need to store on the stack.
+      Address Addr;
+      Addr.BaseType = Address::RegBase;
+      Addr.Base.Reg = ARM::SP;
+      Addr.Offset = VA.getLocMemOffset();
+
+      if (!ARMEmitStore(ArgVT, Arg, Addr)) return false;
+    }
+  }
+  return true;
+}
+
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(AdjStackUp))
+                  .addImm(NumBytes).addImm(0));
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true));
+
+    // Copy all of the result registers out of their specified physreg.
+    if (RVLocs.size() == 2 && RetVT == MVT::f64) {
+      // For this move we copy into two registers and then move into the
+      // double fp reg we want.
+      EVT DestVT = RVLocs[0].getValVT();
+      TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
+      unsigned ResultReg = createResultReg(DstRC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(ARM::VMOVDRR), ResultReg)
+                      .addReg(RVLocs[0].getLocReg())
+                      .addReg(RVLocs[1].getLocReg()));
+
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[1].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    } else {
+      assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
+      EVT CopyVT = RVLocs[0].getValVT();
+      TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+
+      unsigned ResultReg = createResultReg(DstRC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              ResultReg).addReg(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(),
+                  Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+    // TODO: For now, don't try to handle cases where getLocInfo()
+    // says Full but the types don't match.
+    if (TLI.getValueType(RV->getType()) != VA.getValVT())
+      return false;
+
+    // Make the copy.
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            DstReg).addReg(SrcReg);
+
+    // Mark the register as live out of the function.
+    MRI.addLiveOut(VA.getLocReg());
+  }
+
+  unsigned RetOpc = isThumb ? ARM::tBX_RET : ARM::BX_RET;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(RetOpc)));
+  return true;
+}
+
+// A quick function that will emit a call for a named libcall in F with the
+// vector of passed arguments for the Instruction in I. We can assume that we
+// can emit a call for any libcall we can produce. This is an abridged version
+// of the full call infrastructure since we won't need to worry about things
+// like computed function pointers or strange arguments at call sites.
+// TODO: Try to unify this and the normal call bits for ARM, then try to unify
+// with X86.
+bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
+  CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
+
+  // Handle *simple* calls for now.
+  const Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // For now we're using BLX etc on the assumption that we have v5t ops.
+  if (!Subtarget->hasV5TOps()) return false;
+
+  // TODO: For now if we have long calls specified we don't handle the call.
+  if (EnableARMLongCalls) return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(I->getNumOperands());
+  ArgRegs.reserve(I->getNumOperands());
+  ArgVTs.reserve(I->getNumOperands());
+  ArgFlags.reserve(I->getNumOperands());
+  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+    Value *Op = I->getOperand(i);
+    unsigned Arg = getRegForValue(Op);
+    if (Arg == 0) return false;
+
+    const Type *ArgTy = Op->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT)) return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(Op);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+    return false;
+
+  // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
+  // TODO: Turn this into the table of arm call ops.
+  MachineInstrBuilder MIB;
+  unsigned CallOpc;
+  if(isThumb) {
+    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+    // Explicitly adding the predicate here.
+    MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(CallOpc)))
+                         .addExternalSymbol(TLI.getLibcallName(Call));
+  } else {
+    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+    // Explicitly adding the predicate here.
+    MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(CallOpc))
+          .addExternalSymbol(TLI.getLibcallName(Call)));
+  }
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i]);
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm or worry about intrinsics yet.
+  if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false;
+
+  // Only handle global variable Callees that are direct calls.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV || Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()))
+    return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // TODO: Avoid some calling conventions?
+
+  // Let SDISel handle vararg functions.
+  const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  if (FTy->isVarArg())
+    return false;
+
+  // Handle *simple* calls for now.
+  const Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // For now we're using BLX etc on the assumption that we have v5t ops.
+  // TODO: Maybe?
+  if (!Subtarget->hasV5TOps()) return false;
+
+  // TODO: For now if we have long calls specified we don't handle the call.
+  if (EnableARMLongCalls) return false;
+  
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(CS.arg_size());
+  ArgRegs.reserve(CS.arg_size());
+  ArgVTs.reserve(CS.arg_size());
+  ArgFlags.reserve(CS.arg_size());
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    unsigned Arg = getRegForValue(*i);
+
+    if (Arg == 0)
+      return false;
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+         // FIXME: Only handle *easy* calls for now.
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    const Type *ArgTy = (*i)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT))
+      return false;
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+    return false;
+
+  // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
+  // TODO: Turn this into the table of arm call ops.
+  MachineInstrBuilder MIB;
+  unsigned CallOpc;
+  // Explicitly adding the predicate here.
+  if(isThumb) {
+    CallOpc = Subtarget->isTargetDarwin() ? ARM::tBLXi_r9 : ARM::tBLXi;
+    // Explicitly adding the predicate here.
+    MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(CallOpc)))
+          .addGlobalAddress(GV, 0, 0);
+  } else {
+    CallOpc = Subtarget->isTargetDarwin() ? ARM::BLr9 : ARM::BL;
+    // Explicitly adding the predicate here.
+    MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(CallOpc))
+          .addGlobalAddress(GV, 0, 0));
+  }
+  
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i]);
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+
+}
+
 // TODO: SoftFP support.
 bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
-  // No Thumb-1 for now.
-  if (isThumb && !AFI->isThumb2Function()) return false;
-  
+
   switch (I->getOpcode()) {
     case Instruction::Load:
-      return ARMSelectLoad(I);
+      return SelectLoad(I);
     case Instruction::Store:
-      return ARMSelectStore(I);
+      return SelectStore(I);
     case Instruction::Br:
-      return ARMSelectBranch(I);
+      return SelectBranch(I);
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+      return SelectCmp(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectSIToFP(I);
+    case Instruction::FPToSI:
+      return SelectFPToSI(I);
+    case Instruction::FAdd:
+      return SelectBinaryOp(I, ISD::FADD);
+    case Instruction::FSub:
+      return SelectBinaryOp(I, ISD::FSUB);
+    case Instruction::FMul:
+      return SelectBinaryOp(I, ISD::FMUL);
+    case Instruction::SDiv:
+      return SelectSDiv(I);
+    case Instruction::SRem:
+      return SelectSRem(I);
+    case Instruction::Call:
+      return SelectCall(I);
+    case Instruction::Select:
+      return SelectSelect(I);
+    case Instruction::Ret:
+      return SelectRet(I);
     default: break;
   }
   return false;
@@ -659,7 +1888,14 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
 
 namespace llvm {
   llvm::FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) {
-    if (EnableARMFastISel) return new ARMFastISel(funcInfo);
+    // Completely untested on non-darwin.
+    const TargetMachine &TM = funcInfo.MF->getTarget();
+
+    // Darwin and thumb1 only for now.
+    const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    if (Subtarget->isTargetDarwin() && !Subtarget->isThumb1Only() &&
+        !DisableARMFastISel)
+      return new ARMFastISel(funcInfo);
     return 0;
   }
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h
new file mode 100644
index 0000000..3d175e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- ARM/ARMFixupKinds.h - ARM Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM_ARMFIXUPKINDS_H
+#define LLVM_ARM_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+  // addresses
+  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+  // the 16-bit halfwords reordered.
+  fixup_t2_ldst_pcrel_12,
+
+  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where the lower 2 bits are not encoded
+  // (so it's encoded as an 8-bit immediate).
+  fixup_arm_pcrel_10,
+  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_10,
+  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+  // addresses where the lower 2 bits are not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_thumb_adr_pcrel_10,
+  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_arm_adr_pcrel_12,
+  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_t2_adr_pcrel_12,
+  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+  // instructions. 
+  fixup_arm_condbranch,
+  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
+  // branch instructions. (unconditional)
+  fixup_arm_uncondbranch,
+  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+  // uconditional branch instructions.
+  fixup_t2_condbranch,
+  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+  // branch unconditional branch instructions.
+  fixup_t2_uncondbranch,
+
+  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+  fixup_arm_thumb_br,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BL instructions.
+  fixup_arm_thumb_bl,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+  fixup_arm_thumb_blx,
+
+  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+  fixup_arm_thumb_cb,
+
+  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+  fixup_arm_thumb_cp,
+
+  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+  fixup_arm_thumb_bcc,
+
+  // The next two are for the movt/movw pair
+  // the 16bit imm field are split into imm{15-12} and imm{11-0}
+  fixup_arm_movt_hi16, // :upper16:
+  fixup_arm_movw_lo16, // :lower16:
+  fixup_t2_movt_hi16, // :upper16:
+  fixup_t2_movw_lo16, // :lower16:
+
+  // It is possible to create an "immediate" that happens to be pcrel.
+  // movw r0, :lower16:Foo-(Bar+8) and movt  r0, :upper16:Foo-(Bar+8)
+  // result in different reloc tags than the above two.
+  // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC
+  fixup_arm_movt_hi16_pcrel, // :upper16:
+  fixup_arm_movw_lo16_pcrel, // :lower16:
+  fixup_t2_movt_hi16_pcrel, // :upper16:
+  fixup_t2_movw_lo16_pcrel, // :lower16:
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h b/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h
deleted file mode 100644
index d5dae24..0000000
--- a/contrib/llvm/lib/Target/ARM/ARMFrameInfo.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_FRAMEINFO_H
-#define ARM_FRAMEINFO_H
-
-#include "ARM.h"
-#include "ARMSubtarget.h"
-#include "llvm/Target/TargetFrameInfo.h"
-
-namespace llvm {
-
-class ARMFrameInfo : public TargetFrameInfo {
-public:
-  explicit ARMFrameInfo(const ARMSubtarget &ST)
-    : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) {
-  }
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
new file mode 100644
index 0000000..f42c6db
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -0,0 +1,1021 @@
+//=======- ARMFrameLowering.cpp - ARM Frame Information --------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFrameLowering.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+
+  // Mac OS X requires FP not to be clobbered for backtracing purpose.
+  if (STI.isTargetDarwin())
+    return true;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Always eliminate non-leaf frame pointers.
+  return ((DisableFramePointerElim(MF) && MFI->hasCalls()) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Unlike most targets, having a FP
+/// is not sufficient here since we still may reference some objects via SP
+/// even when FP is available in Thumb2 mode.
+bool
+ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const unsigned *CSRegs) {
+  // Integer spill area is handled with "pop".
+  if (MI->getOpcode() == ARM::LDMIA_RET ||
+      MI->getOpcode() == ARM::t2LDMIA_RET ||
+      MI->getOpcode() == ARM::LDMIA_UPD ||
+      MI->getOpcode() == ARM::t2LDMIA_UPD ||
+      MI->getOpcode() == ARM::VLDMDIA_UPD) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  if ((MI->getOpcode() == ARM::LDR_POST ||
+       MI->getOpcode() == ARM::t2LDR_POST) &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
+      MI->getOperand(1).getReg() == ARM::SP)
+    return true;
+
+  return false;
+}
+
+static void
+emitSPUpdate(bool isARM,
+             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+             DebugLoc dl, const ARMBaseInstrInfo &TII,
+             int NumBytes,
+             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+  if (isARM)
+    emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                            Pred, PredReg, TII);
+  else
+    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                           Pred, PredReg, TII);
+}
+
+void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  // Allocate the vararg register save area. This is not counted in NumBytes.
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  // Move past area 1.
+  if (GPRCS1Size > 0) MBBI++;
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For Darwin, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not Darwin, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it is
+  // now safe to emit this assignment.
+  bool HasFP = hasFP(MF);
+  if (HasFP) {
+    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+    AddDefaultCC(AddDefaultPred(MIB));
+  }
+
+  // Move past area 2.
+  if (GPRCS2Size > 0) MBBI++;
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  // Move past area 3.
+  if (DPRCSSize > 0) MBBI++;
+
+  NumBytes = DPRCSOffset;
+  if (NumBytes) {
+    // Adjust SP after all the callee-save spills.
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+    if (HasFP && isARM)
+      // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
+      // Note it's not safe to do this in Thumb2 mode because it would have
+      // taken two instructions:
+      // mov sp, r7
+      // sub sp, #24
+      // If an interrupt is taken between the two instructions, then sp is in
+      // an inconsistent state (pointing to the middle of callee-saved area).
+      // The interrupt handler can end up clobbering the registers.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF))
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need dynamic stack realignment, do it here. Be paranoid and make
+  // sure if we also have VLAs, we have a base pointer for frame access.
+  if (RegInfo->needsStackRealignment(MF)) {
+    unsigned MaxAlign = MFI->getMaxAlignment();
+    assert (!AFI->isThumb1OnlyFunction());
+    if (!AFI->isThumbFunction()) {
+      // Emit bic sp, sp, MaxAlign
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::BICri), ARM::SP)
+                                  .addReg(ARM::SP, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+    } else {
+      // We cannot use sp as source/dest register here, thus we're emitting the
+      // following sequence:
+      // mov r4, sp
+      // bic r4, r4, MaxAlign
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4)
+        .addReg(ARM::SP, RegState::Kill);
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::t2BICri), ARM::R4)
+                                  .addReg(ARM::R4, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+        .addReg(ARM::R4, RegState::Kill);
+    }
+
+    AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF)) {
+    if (isARM)
+      BuildMI(MBB, MBBI, dl,
+              TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+        .addReg(ARM::SP)
+        .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+    else
+      BuildMI(MBB, MBBI, dl,
+              TII.get(ARM::tMOVgpr2gpr), RegInfo->getBaseRegister())
+        .addReg(ARM::SP);
+  }
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->getDesc().isReturn() &&
+         "Can only insert epilog into returning blocks");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const unsigned *CSRegs = RegInfo->getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      if (!isCSRestore(MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Reset SP based on frame pointer only if the stack frame extends beyond
+    // frame pointer stack slot or target is ELF and the function has FP.
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      if (NumBytes) {
+        if (isARM)
+          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                  ARMCC::AL, 0, TII);
+        else {
+          // It's not possible to restore SP from FP in a single instruction.
+          // For Darwin, this looks like:
+          // mov sp, r7
+          // sub sp, #24
+          // This is bad, if an interrupt is taken after the mov, sp is in an
+          // inconsistent state.
+          // Use the first callee-saved register as a scratch register.
+          assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+                 "No scratch register to restore SP from FP!");
+          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                 ARMCC::AL, 0, TII);
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
+            .addReg(ARM::R4);
+        }
+      } else {
+        // Thumb2 or ARM.
+        if (isARM)
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+            .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+        else
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
+            .addReg(FramePtr);
+      }
+    } else if (NumBytes)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+    // Increment past our save areas.
+    if (AFI->getDPRCalleeSavedAreaSize()) MBBI++;
+    if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
+    if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+  }
+
+  if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND ||
+      RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) {
+    // Tail call return: adjust the stack pointer and jump to callee.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+    // Jump to label or value in register.
+    if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) {
+      unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi)
+        ? (STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)
+        : (STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND);
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+      if (JumpTarget.isGlobal())
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+    } else if (RetOpcode == ARM::TCRETURNri) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    } else if (RetOpcode == ARM::TCRETURNriND) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr *NewMI = prior(MBBI);
+    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+      NewMI->addOperand(MBBI->getOperand(i));
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+  }
+
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int
+ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                         unsigned &FrameReg) const {
+  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
+}
+
+int
+ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+                                             int FI,
+                                             unsigned &FrameReg,
+                                             int SPAdj) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  FrameReg = ARM::SP;
+  Offset += SPAdj;
+  if (AFI->isGPRCalleeSavedArea1Frame(FI))
+    return Offset - AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
+    return Offset - AFI->getGPRCalleeSavedArea2Offset();
+  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
+    return Offset - AFI->getDPRCalleeSavedAreaOffset();
+
+  // When dynamically realigning the stack, use the frame pointer for
+  // parameters, and the stack/base pointer for locals.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+    if (isFixed) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      Offset = FPOffset;
+    } else if (MFI->hasVarSizedObjects()) {
+      assert(RegInfo->hasBasePointer(MF) &&
+             "VLAs and dynamic stack alignment, but missing base pointer!");
+      FrameReg = RegInfo->getBaseRegister();
+    }
+    return Offset;
+  }
+
+  // If there is a frame pointer, use it when we can.
+  if (hasFP(MF) && AFI->hasStackFrame()) {
+    // Use frame pointer to reference fixed objects. Use it for locals if
+    // there are VLAs (and thus the SP isn't reliable as a base).
+    if (isFixed || (MFI->hasVarSizedObjects() &&
+                    !RegInfo->hasBasePointer(MF))) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    } else if (MFI->hasVarSizedObjects()) {
+      assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
+      // Try to use the frame pointer if we can, else use the base pointer
+      // since it's available. This is handy for the emergency spill slot, in
+      // particular.
+      if (AFI->isThumb2Function()) {
+        if (FPOffset >= -255 && FPOffset < 0) {
+          FrameReg = RegInfo->getFrameRegister(MF);
+          return FPOffset;
+        }
+      } else
+        FrameReg = RegInfo->getBaseRegister();
+    } else if (AFI->isThumb2Function()) {
+      // In Thumb2 mode, the negative offset is very limited. Try to avoid
+      // out of range references.
+      if (FPOffset >= -255 && FPOffset < 0) {
+        FrameReg = RegInfo->getFrameRegister(MF);
+        return FPOffset;
+      }
+    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
+      // Otherwise, use SP or FP, whichever is closer to the stack slot.
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    }
+  }
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  return Offset;
+}
+
+int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    unsigned StmOpc, unsigned StrOpc,
+                                    bool NoGap,
+                                    bool(*Func)(unsigned, bool)) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  SmallVector<std::pair<unsigned,bool>, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+
+      // Add the callee-saved register as live-in unless it's LR and
+      // @llvm.returnaddress is called. If LR is returned for
+      // @llvm.returnaddress then it's already added to the function and
+      // entry block live-in sets.
+      bool isKill = true;
+      if (Reg == ARM::LR) {
+        if (MF.getFrameInfo()->isReturnAddressTaken() &&
+            MF.getRegInfo().isLiveIn(Reg))
+          isKill = false;
+      }
+
+      if (isKill)
+        MBB.addLiveIn(Reg);
+
+      // If NoGap is true, push consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+      LastReg = Reg;
+      Regs.push_back(std::make_pair(Reg, isKill));
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || StrOpc== 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+                       .addReg(ARM::SP));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
+    } else if (Regs.size() == 1) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
+                                        ARM::SP)
+        .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+        .addReg(ARM::SP);
+      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+      // that refactoring is complete (eventually).
+      if (StrOpc == ARM::STR_PRE) {
+        MIB.addReg(0);
+        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift));
+      } else
+        MIB.addImm(-4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+  }
+}
+
+void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   unsigned LdmOpc, unsigned LdrOpc,
+                                   bool isVarArg, bool NoGap,
+                                   bool(*Func)(unsigned, bool)) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned RetOpcode = MI->getOpcode();
+  bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
+                     RetOpcode == ARM::TCRETURNdiND ||
+                     RetOpcode == ARM::TCRETURNri ||
+                     RetOpcode == ARM::TCRETURNriND);
+
+  SmallVector<unsigned, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    bool DeleteRet = false;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetDarwin())) continue;
+
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) {
+        Reg = ARM::PC;
+        LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+        // Fold the return instruction into the LDM.
+        DeleteRet = true;
+      }
+
+      // If NoGap is true, pop consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+
+      LastReg = Reg;
+      Regs.push_back(Reg);
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || LdrOpc == 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+                       .addReg(ARM::SP));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i], getDefRegState(true));
+      if (DeleteRet)
+        MI->eraseFromParent();
+      MI = MIB;
+    } else if (Regs.size() == 1) {
+      // If we adjusted the reg to PC from LR above, switch it back here. We
+      // only do that for LDM.
+      if (Regs[0] == ARM::PC)
+        Regs[0] = ARM::LR;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
+          .addReg(ARM::SP, RegState::Define)
+          .addReg(ARM::SP);
+      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+      // that refactoring is complete (eventually).
+      if (LdrOpc == ARM::LDR_POST) {
+        MIB.addReg(0);
+        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
+      } else
+        MIB.addImm(4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+  }
+}
+
+bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE;
+  unsigned FltOpc = ARM::VSTMDDB_UPD;
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register);
+
+  return true;
+}
+
+bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+
+  unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST;
+  unsigned FltOpc = ARM::VLDMDIA_UPD;
+  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea2Register);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea1Register);
+
+  return true;
+}
+
+// FIXME: Make generic?
+static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
+                                       const ARMBaseInstrInfo &TII) {
+  unsigned FnSize = 0;
+  for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+    for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end();
+         I != E; ++I)
+      FnSize += TII.GetInstSizeInBytes(I);
+  }
+  return FnSize;
+}
+
+/// estimateStackSize - Estimate and return the size of the frame.
+/// FIXME: Make generic?
+static unsigned estimateStackSize(MachineFunction &MF) {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  int Offset = 0;
+  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -FFI->getObjectOffset(i);
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    if (FFI->isDeadObjectIndex(i))
+      continue;
+    Offset += FFI->getObjectSize(i);
+    unsigned Align = FFI->getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset+Align-1)/Align*Align;
+  }
+  return (unsigned)Offset;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+// FIXME: Move to TII?
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+                                         const TargetFrameLowering *TFI) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned Limit = (1 << 12) - 1;
+  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!I->getOperand(i).isFI()) continue;
+
+        // When using ADDri to get the address of a stack object, 255 is the
+        // largest offset guaranteed to fit in the immediate offset.
+        if (I->getOpcode() == ARM::ADDri) {
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        }
+
+        // Otherwise check the addressing mode.
+        switch (I->getDesc().TSFlags & ARMII::AddrModeMask) {
+        case ARMII::AddrMode3:
+        case ARMII::AddrModeT2_i8:
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode5:
+        case ARMII::AddrModeT2_i8s4:
+          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+          break;
+        case ARMII::AddrModeT2_i12:
+          // i12 supports only positive offset so these will be converted to
+          // i8 opcodes. See llvm::rewriteT2FrameIndex.
+          if (TFI->hasFP(MF) && AFI->hasStackFrame())
+            Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode4:
+        case ARMII::AddrMode6:
+          // Addressing modes 4 & 6 (load/store) instructions can't encode an
+          // immediate offset for stack references.
+          return 0;
+        default:
+          break;
+        }
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+
+void
+ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage the eliminateFrameIndex machinery. This also ensures it
+  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+  // to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register. Also spill R4 if Thumb2 function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
+  // FIXME: It will be better just to find spare register here.
+  if (AFI->isThumb2Function() &&
+      (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+    MF.getRegInfo().setPhysRegUsed(ARM::R4);
+
+  if (AFI->isThumb1OnlyFunction()) {
+    // Spill LR if Thumb1 function uses variable length argument lists.
+    if (AFI->getVarArgsRegSaveSize() > 0)
+      MF.getRegInfo().setPhysRegUsed(ARM::LR);
+
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP since 
+    // FIXME: It will be better just to find spare register here.
+    if (MFI->hasVarSizedObjects())
+      MF.getRegInfo().setPhysRegUsed(ARM::R4);
+  }
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is used.
+  const unsigned *CSRegs = RegInfo->getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (MF.getRegInfo().isPhysRegUsed(Reg)) {
+      Spilled = true;
+      CanEliminateFrame = false;
+    } else {
+      // Check alias registers too.
+      for (const unsigned *Aliases =
+             RegInfo->getAliasSet(Reg); *Aliases; ++Aliases) {
+        if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
+          Spilled = true;
+          CanEliminateFrame = false;
+        }
+      }
+    }
+
+    if (!ARM::GPRRegisterClass->contains(Reg))
+      continue;
+
+    if (Spilled) {
+      NumGPRSpills++;
+
+      if (!STI.isTargetDarwin()) {
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+        CS1Spilled = true;
+        continue;
+      }
+
+      // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+      switch (Reg) {
+      case ARM::LR:
+        LRSpilled = true;
+        // Fallthrough
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+        CS1Spilled = true;
+        break;
+      default:
+        break;
+      }
+    } else {
+      if (!STI.isTargetDarwin()) {
+        UnspilledCS1GPRs.push_back(Reg);
+        continue;
+      }
+
+      switch (Reg) {
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        UnspilledCS1GPRs.push_back(Reg);
+        break;
+      default:
+        UnspilledCS2GPRs.push_back(Reg);
+        break;
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    // Force LR to be spilled if the Thumb function size is > 2048. This enables
+    // use of BL to implement far jump. If it turns out that it's not needed
+    // then the branch fix up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  // If any of the stack slot references may be out of range of an immediate
+  // offset, make sure a register (or a spill slot) is available for the
+  // register scavenger. Note that if we're indexing off the frame pointer, the
+  // effective stack size is 4 bytes larger since the FP points to the stack
+  // slot of the previous FP. Also, if we have variable sized objects in the
+  // function, stack slot references will often be negative, and some of
+  // our instructions are positive-offset only, so conservatively consider
+  // that case to want a spill slot (or register) as well. Similarly, if
+  // the function adjusts the stack pointer during execution and the
+  // adjustments aren't already part of our stack size estimate, our offset
+  // calculations may be off, so be conservative.
+  // FIXME: We could add logic to be more precise about negative offsets
+  //        and which instructions will need a scratch register for them. Is it
+  //        worth the effort and added fragility?
+  bool BigStack =
+    (RS &&
+     (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
+      estimateRSStackSizeLimit(MF, this)))
+    || MFI->hasVarSizedObjects()
+    || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
+
+  bool ExtraCSSpill = false;
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      MF.getRegInfo().setPhysRegUsed(ARM::LR);
+      NumGPRSpills++;
+      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    if (hasFP(MF)) {
+      MF.getRegInfo().setPhysRegUsed(FramePtr);
+      NumGPRSpills++;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs, spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    unsigned TargetAlign = getStackAlignment();
+    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high register if the function is thumb1
+          if (!AFI->isThumb1OnlyFunction() ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            MF.getRegInfo().setPhysRegUsed(Reg);
+            if (!RegInfo->isReservedReg(MF, Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        MF.getRegInfo().setPhysRegUsed(Reg);
+        if (!RegInfo->isReservedReg(MF, Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (BigStack && !ExtraCSSpill) {
+      // If any non-reserved CS register isn't spilled, just spill one or two
+      // extra. That should take care of it!
+      unsigned NumExtras = TargetAlign / 4;
+      SmallVector<unsigned, 2> Extras;
+      while (NumExtras && !UnspilledCS1GPRs.empty()) {
+        unsigned Reg = UnspilledCS1GPRs.back();
+        UnspilledCS1GPRs.pop_back();
+        if (!RegInfo->isReservedReg(MF, Reg) &&
+            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+             Reg == ARM::LR)) {
+          Extras.push_back(Reg);
+          NumExtras--;
+        }
+      }
+      // For non-Thumb1 functions, also check for hi-reg CS registers
+      if (!AFI->isThumb1OnlyFunction()) {
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!RegInfo->isReservedReg(MF, Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+      }
+      if (Extras.size() && NumExtras == 0) {
+        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+          MF.getRegInfo().setPhysRegUsed(Extras[i]);
+        }
+      } else if (!AFI->isThumb1OnlyFunction()) {
+        // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
+        // closest to SP or frame pointer.
+        const TargetRegisterClass *RC = ARM::GPRRegisterClass;
+        RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                           RC->getAlignment(),
+                                                           false));
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    MF.getRegInfo().setPhysRegUsed(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 0000000..1288b70
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,74 @@
+//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMFrameLowering : public TargetFrameLowering {
+protected:
+  const ARMSubtarget &STI;
+
+public:
+  explicit ARMFrameLowering(const ARMSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+      STI(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const;
+  int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg, int SPAdj) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
+
+ private:
+  void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
+                    unsigned StrOpc, bool NoGap,
+                    bool(*Func)(unsigned, bool)) const;
+  void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+                   unsigned LdrOpc, bool isVarArg, bool NoGap,
+                   bool(*Func)(unsigned, bool)) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp
index 85b0c6c..3f02383 100644
--- a/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp
@@ -12,7 +12,8 @@
 // global). Such a transformation can significantly reduce the register pressure
 // when many globals are involved.
 //
-// For example, consider the code which touches several global variables at once:
+// For example, consider the code which touches several global variables at 
+// once:
 //
 // static int foo[N], bar[N], baz[N];
 //
@@ -48,7 +49,7 @@
 //  str     r0, [r5], #4
 //
 //  note that we saved 2 registers here almostly "for free".
-// ===----------------------------------------------------------------------===//
+// ===---------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "arm-global-merge"
 #include "ARM.h"
@@ -64,16 +65,17 @@
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
 using namespace llvm;
 
 namespace {
-  class LLVM_LIBRARY_VISIBILITY ARMGlobalMerge : public FunctionPass {
+  class ARMGlobalMerge : public FunctionPass {
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
     /// target type sizes.
     const TargetLowering *TLI;
 
     bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
-                 Module &M, bool) const;
+                 Module &M, bool isConst) const;
 
   public:
     static char ID;             // Pass identification, replacement for typeid.
@@ -81,7 +83,7 @@ namespace {
       : FunctionPass(ID), TLI(tli) {}
 
     virtual bool doInitialization(Module &M);
-    virtual bool runOnFunction(Function& F);
+    virtual bool runOnFunction(Function &F);
 
     const char *getPassName() const {
       return "Merge internal globals";
@@ -95,13 +97,11 @@ namespace {
     struct GlobalCmp {
       const TargetData *TD;
 
-      GlobalCmp(const TargetData *td):
-        TD(td) { }
+      GlobalCmp(const TargetData *td) : TD(td) { }
 
-      bool operator() (const GlobalVariable* GV1,
-                       const GlobalVariable* GV2) {
-        const Type* Ty1 = cast<PointerType>(GV1->getType())->getElementType();
-        const Type* Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+      bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
+        const Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+        const Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
 
         return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2));
       }
@@ -130,27 +130,27 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
     uint64_t MergedSize = 0;
     std::vector<const Type*> Tys;
     std::vector<Constant*> Inits;
-    for (j = i; MergedSize < MaxOffset && j != e; ++j) {
-      const Type* Ty = Globals[j]->getType()->getElementType();
+    for (j = i; j != e; ++j) {
+      const Type *Ty = Globals[j]->getType()->getElementType();
+      MergedSize += TD->getTypeAllocSize(Ty);
+      if (MergedSize > MaxOffset) {
+        break;
+      }
       Tys.push_back(Ty);
       Inits.push_back(Globals[j]->getInitializer());
-      MergedSize += TD->getTypeAllocSize(Ty);
     }
 
-    StructType* MergedTy = StructType::get(M.getContext(), Tys);
-    Constant* MergedInit = ConstantStruct::get(MergedTy, Inits);
-    GlobalVariable* MergedGV = new GlobalVariable(M, MergedTy, isConst,
+    StructType *MergedTy = StructType::get(M.getContext(), Tys);
+    Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+    GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
                                                   GlobalValue::InternalLinkage,
-                                                  MergedInit, "merged");
+                                                  MergedInit, "_MergedGlobals");
     for (size_t k = i; k < j; ++k) {
-      SmallVector<Constant*, 2> Idx;
-      Idx.push_back(ConstantInt::get(Int32Ty, 0));
-      Idx.push_back(ConstantInt::get(Int32Ty, k-i));
-
-      Constant* GEP =
-        ConstantExpr::getInBoundsGetElementPtr(MergedGV,
-                                               &Idx[0], Idx.size());
-
+      Constant *Idx[2] = {
+        ConstantInt::get(Int32Ty, 0),
+        ConstantInt::get(Int32Ty, k-i)
+      };
+      Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx, 2);
       Globals[k]->replaceAllUsesWith(GEP);
       Globals[k]->eraseFromParent();
     }
@@ -161,8 +161,8 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
 }
 
 
-bool ARMGlobalMerge::doInitialization(Module& M) {
-  SmallVector<GlobalVariable*, 16> Globals, ConstGlobals;
+bool ARMGlobalMerge::doInitialization(Module &M) {
+  SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals;
   const TargetData *TD = TLI->getTargetData();
   unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;
@@ -183,8 +183,11 @@ bool ARMGlobalMerge::doInitialization(Module& M) {
         I->getName().startswith(".llvm."))
       continue;
 
-    if (TD->getTypeAllocSize(I->getType()) < MaxOffset) {
-      if (I->isConstant())
+    if (TD->getTypeAllocSize(I->getType()->getElementType()) < MaxOffset) {
+      const TargetLoweringObjectFile &TLOF = TLI->getObjFileLowering();
+      if (TLOF.getKindForGlobal(I, TLI->getTargetMachine()).isBSSLocal())
+        BSSGlobals.push_back(I);
+      else if (I->isConstant())
         ConstGlobals.push_back(I);
       else
         Globals.push_back(I);
@@ -193,17 +196,19 @@ bool ARMGlobalMerge::doInitialization(Module& M) {
 
   if (Globals.size() > 1)
     Changed |= doMerge(Globals, M, false);
+  if (BSSGlobals.size() > 1)
+    Changed |= doMerge(BSSGlobals, M, false);
+
   // FIXME: This currently breaks the EH processing due to way how the 
   // typeinfo detection works. We might want to detect the TIs and ignore 
   // them in the future.
-  
   // if (ConstGlobals.size() > 1)
   //  Changed |= doMerge(ConstGlobals, M, true);
 
   return Changed;
 }
 
-bool ARMGlobalMerge::runOnFunction(Function& F) {
+bool ARMGlobalMerge::runOnFunction(Function &F) {
   return false;
 }
 
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
new file mode 100644
index 0000000..676b01e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -0,0 +1,121 @@
+//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
+                         const TargetRegisterInfo &TRI) {
+  // FIXME: Detect integer instructions properly.
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned Domain = TID.TSFlags & ARMII::DomainMask;
+  if (Domain == ARMII::DomainVFP) {
+    unsigned Opcode = MI->getOpcode();
+    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
+        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return false;
+  } else if (Domain == ARMII::DomainNEON) {
+    if (MI->getDesc().mayStore() || MI->getDesc().mayLoad())
+      return false;
+  } else
+    return false;
+  return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (!MI->isDebugValue()) {
+    if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1])
+      return Hazard;
+
+    // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
+    // a VMLA / VMLS will cause 4 cycle stall.
+    const TargetInstrDesc &TID = MI->getDesc();
+    if (LastMI && (TID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
+      MachineInstr *DefMI = LastMI;
+      const TargetInstrDesc &LastTID = LastMI->getDesc();
+      // Skip over one non-VFP / NEON instruction.
+      if (!LastTID.isBarrier() &&
+          (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
+        MachineBasicBlock::iterator I = LastMI;
+        if (I != LastMI->getParent()->begin()) {
+          I = llvm::prior(I);
+          DefMI = &*I;
+        }
+      }
+
+      if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
+          (TII.canCauseFpMLxStall(MI->getOpcode()) ||
+           hasRAWHazard(DefMI, MI, TRI))) {
+        // Try to schedule another instruction for the next 4 cycles.
+        if (FpMLxStalls == 0)
+          FpMLxStalls = 4;
+        return Hazard;
+      }
+    }
+  }
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+void ARMHazardRecognizer::Reset() {
+  LastMI = 0;
+  FpMLxStalls = 0;
+  ITBlockSize = 0;
+  ScoreboardHazardRecognizer::Reset();
+}
+
+void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+  unsigned Opcode = MI->getOpcode();
+  if (ITBlockSize) {
+    --ITBlockSize;
+  } else if (Opcode == ARM::t2IT) {
+    unsigned Mask = MI->getOperand(1).getImm();
+    unsigned NumTZ = CountTrailingZeros_32(Mask);
+    assert(NumTZ <= 3 && "Invalid IT mask!");
+    ITBlockSize = 4 - NumTZ;
+    MachineBasicBlock::iterator I = MI;
+    for (unsigned i = 0; i < ITBlockSize; ++i) {
+      // Advance to the next instruction, skipping any dbg_value instructions.
+      do {
+        ++I;
+      } while (I->isDebugValue());
+      ITBlockMIs[ITBlockSize-1-i] = &*I;
+    }
+  }
+
+  if (!MI->isDebugValue()) {
+    LastMI = MI;
+    FpMLxStalls = 0;
+  }
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void ARMHazardRecognizer::AdvanceCycle() {
+  if (FpMLxStalls && --FpMLxStalls == 0)
+    // Stalled for 4 cycles but still can't schedule any other instructions.
+    LastMI = 0;
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void ARMHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("reverse ARM hazard checking unsupported");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
new file mode 100644
index 0000000..2bc218d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -0,0 +1,54 @@
+//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling ARM functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMHAZARDRECOGNIZER_H
+#define ARMHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+
+namespace llvm {
+
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
+class MachineInstr;
+
+class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ARMBaseInstrInfo &TII;
+  const ARMBaseRegisterInfo &TRI;
+  const ARMSubtarget &STI;
+
+  MachineInstr *LastMI;
+  unsigned FpMLxStalls;
+  unsigned ITBlockSize;  // No. of MIs in current IT block yet to be scheduled.
+  MachineInstr *ITBlockMIs[4];
+
+public:
+  ARMHazardRecognizer(const InstrItineraryData *ItinData,
+                      const ARMBaseInstrInfo &tii,
+                      const ARMBaseRegisterInfo &tri,
+                      const ARMSubtarget &sti,
+                      const ScheduleDAG *DAG) :
+    ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii),
+    TRI(tri), STI(sti), LastMI(0), ITBlockSize(0) {}
+
+  virtual HazardType getHazardType(SUnit *SU, int Stalls);
+  virtual void Reset();
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  virtual void RecedeCycle();
+};
+
+} // end namespace llvm
+
+#endif // ARMHAZARDRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 51a30c1..a506cff 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -13,6 +13,7 @@
 
 #define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
+#include "ARMBaseInstrInfo.h"
 #include "ARMAddressingModes.h"
 #include "ARMTargetMachine.h"
 #include "llvm/CallingConv.h"
@@ -41,13 +42,25 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
   cl::desc("Disable isel of shifter-op"),
   cl::init(false));
 
+static cl::opt<bool>
+CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
+  cl::desc("Check fp vmla / vmls hazard at isel time"),
+  cl::init(false));
+
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
 /// instructions for SelectionDAG operations.
 ///
 namespace {
+
+enum AddrMode2Type {
+  AM2_BASE, // Simple AM2 (+-imm12)
+  AM2_SHOP  // Shifter-op AM2
+};
+
 class ARMDAGToDAGISel : public SelectionDAGISel {
   ARMBaseTargetMachine &TM;
+  const ARMBaseInstrInfo *TII;
 
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -57,7 +70,8 @@ public:
   explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
                            CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-    Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
+      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
   virtual const char *getPassName() const {
@@ -72,60 +86,101 @@ public:
 
   SDNode *Select(SDNode *N);
 
-  bool SelectShifterOperandReg(SDNode *Op, SDValue N, SDValue &A,
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
+  bool isShifterOpProfitable(const SDValue &Shift,
+                             ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
+  bool SelectShifterOperandReg(SDValue N, SDValue &A,
                                SDValue &B, SDValue &C);
-  bool SelectAddrMode2(SDNode *Op, SDValue N, SDValue &Base,
-                       SDValue &Offset, SDValue &Opc);
+  bool SelectShiftShifterOperandReg(SDValue N, SDValue &A,
+                                    SDValue &B, SDValue &C);
+  bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
+
+  AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base,
+                                      SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE;
+  }
+
+  bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP;
+  }
+
+  bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset,
+                       SDValue &Opc) {
+    SelectAddrMode2Worker(N, Base, Offset, Opc);
+//    return SelectAddrMode2ShOp(N, Base, Offset, Opc);
+    // This always matches one way or another.
+    return true;
+  }
+
   bool SelectAddrMode2Offset(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
-  bool SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base,
+  bool SelectAddrMode3(SDValue N, SDValue &Base,
                        SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
-  bool SelectAddrMode4(SDNode *Op, SDValue N, SDValue &Addr,
-                       SDValue &Mode);
-  bool SelectAddrMode5(SDNode *Op, SDValue N, SDValue &Base,
+  bool SelectAddrMode5(SDValue N, SDValue &Base,
                        SDValue &Offset);
-  bool SelectAddrMode6(SDNode *Op, SDValue N, SDValue &Addr, SDValue &Align);
-
-  bool SelectAddrModePC(SDNode *Op, SDValue N, SDValue &Offset,
-                        SDValue &Label);
-
-  bool SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &Offset);
-  bool SelectThumbAddrModeRI5(SDNode *Op, SDValue N, unsigned Scale,
-                              SDValue &Base, SDValue &OffImm,
-                              SDValue &Offset);
-  bool SelectThumbAddrModeS1(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &OffImm, SDValue &Offset);
-  bool SelectThumbAddrModeS2(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &OffImm, SDValue &Offset);
-  bool SelectThumbAddrModeS4(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &OffImm, SDValue &Offset);
-  bool SelectThumbAddrModeSP(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &OffImm);
-
-  bool SelectT2ShifterOperandReg(SDNode *Op, SDValue N,
+  bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+
+  bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
+
+  // Thumb Addressing Modes:
+  bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset,
+                             unsigned Scale);
+  bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
+                                SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  // Thumb 2 Addressing Modes:
+  bool SelectT2ShifterOperandReg(SDValue N,
                                  SDValue &BaseReg, SDValue &Opc);
-  bool SelectT2AddrModeImm12(SDNode *Op, SDValue N, SDValue &Base,
-                             SDValue &OffImm);
-  bool SelectT2AddrModeImm8(SDNode *Op, SDValue N, SDValue &Base,
+  bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
                             SDValue &OffImm);
   bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
                                  SDValue &OffImm);
-  bool SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, SDValue &Base,
-                              SDValue &OffImm);
-  bool SelectT2AddrModeSoReg(SDNode *Op, SDValue N, SDValue &Base,
+  bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
                              SDValue &OffReg, SDValue &ShImm);
 
+  inline bool is_so_imm(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }
+
+  inline bool is_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(~Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(~Imm) != -1;
+  }
+
   inline bool Pred_so_imm(SDNode *inN) const {
     ConstantSDNode *N = cast<ConstantSDNode>(inN);
-    return ARM_AM::getSOImmVal(N->getZExtValue()) != -1;
+    return is_so_imm(N->getZExtValue());
   }
 
   inline bool Pred_t2_so_imm(SDNode *inN) const {
     ConstantSDNode *N = cast<ConstantSDNode>(inN);
-    return ARM_AM::getT2SOImmVal(N->getZExtValue()) != -1;
+    return is_t2_so_imm(N->getZExtValue());
   }
 
   // Include the pieces autogenerated from the target description.
@@ -141,22 +196,30 @@ private:
   /// 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
   /// loads of D registers and even subregs and odd subregs of Q registers.
   /// For NumVecs <= 2, QOpcodes1 is not used.
-  SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    unsigned *DOpcodes,
                     unsigned *QOpcodes0, unsigned *QOpcodes1);
 
   /// SelectVST - Select NEON store intrinsics.  NumVecs should
   /// be 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
   /// stores of D registers and even subregs and odd subregs of Q registers.
   /// For NumVecs <= 2, QOpcodes1 is not used.
-  SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    unsigned *DOpcodes,
                     unsigned *QOpcodes0, unsigned *QOpcodes1);
 
   /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
   /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
-  /// load/store of D registers and even subregs and odd subregs of Q registers.
-  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
-                          unsigned *DOpcodes, unsigned *QOpcodes0,
-                          unsigned *QOpcodes1);
+  /// load/store of D registers and Q registers.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
+                          bool isUpdating, unsigned NumVecs,
+                          unsigned *DOpcodes, unsigned *QOpcodes);
+
+  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
+  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
+  /// for loading D registers.  (Q registers are not supported.)
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       unsigned *Opcodes);
 
   /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
   /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
@@ -174,10 +237,10 @@ private:
   SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
                                ARMCC::CondCodes CCVal, SDValue CCR,
                                SDValue InFlag);
-  SDNode *SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+  SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
                               ARMCC::CondCodes CCVal, SDValue CCR,
                               SDValue InFlag);
-  SDNode *SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+  SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
                                ARMCC::CondCodes CCVal, SDValue CCR,
                                SDValue InFlag);
 
@@ -199,9 +262,8 @@ private:
   SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
   SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
 
-  // Form sequences of 8 consecutive D registers.
-  SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3,
-                    SDValue V4, SDValue V5, SDValue V6, SDValue V7);
+  // Get the alignment operand for a NEON VLD or VST instruction.
+  SDValue GetVLDSTAlign(SDValue Align, unsigned NumVecs, bool is64BitVector);
 };
 }
 
@@ -229,9 +291,85 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
          isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
+static bool isScaledConstantInRange(SDValue Node, unsigned Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale && "Invalid scale!");
+
+  // Check that this is a constant.
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
+  if (!C)
+    return false;
 
-bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op,
-                                              SDValue N,
+  ScaledConstant = (int) C->getZExtValue();
+  if ((ScaledConstant % Scale) != 0)
+    return false;
+
+  ScaledConstant /= Scale;
+  return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoidded.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode());
+    if (TID.mayStore())
+      return true;
+    unsigned Opcode = TID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  if (!Subtarget->isCortexA9())
+    return true;
+  if (Shift.hasOneUse())
+    return true;
+  // R << 2 is free.
+  return ShOpcVal == ARM_AM::lsl && ShAmt == 2;
+}
+
+bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
                                               SDValue &BaseReg,
                                               SDValue &ShReg,
                                               SDValue &Opc) {
@@ -251,16 +389,92 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op,
     ShImmVal = RHS->getZExtValue() & 31;
   } else {
     ShReg = N.getOperand(1);
+    if (!isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+      return false;
   }
   Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
                                   MVT::i32);
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
-                                      SDValue &Base, SDValue &Offset,
+bool ARMDAGToDAGISel::SelectShiftShifterOperandReg(SDValue N,
+                                                   SDValue &BaseReg,
+                                                   SDValue &ShReg,
+                                                   SDValue &Opc) {
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  // Do not check isShifterOpProfitable. This must return true.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    ShReg = CurDAG->getRegister(0, MVT::i32);
+    ShImmVal = RHS->getZExtValue() & 31;
+  } else {
+    ShReg = N.getOperand(1);
+  }
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    }
+    
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        !(Subtarget->useMovt() &&
+                     N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+
+bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
                                       SDValue &Opc) {
-  if (N.getOpcode() == ISD::MUL) {
+  if (N.getOpcode() == ISD::MUL &&
+      (!Subtarget->isCortexA9() || N.hasOneUse())) {
     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
       // X * [3,5,9] -> X + X * [2,4,8] etc.
       int RHSC = (int)RHS->getZExtValue();
@@ -283,7 +497,114 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
     }
   }
 
-  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ISD::ADD.
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave simple R +/- imm12 operands for LDRi12
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) // 12 bits.
+      return false;
+  }
+
+  if (Subtarget->isCortexA9() && !N.hasOneUse())
+    // Compute R +/- (R << N) and reuse it.
+    return false;
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (!Subtarget->isCortexA9() ||
+            (N.hasOneUse() &&
+             isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+
+
+//-----
+
+AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
+                                                     SDValue &Base,
+                                                     SDValue &Offset,
+                                                     SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      (!Subtarget->isCortexA9() || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return AM2_SHOP;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ADD.
+      !CurDAG->isBaseWithConstantOffset(N)) {
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
@@ -297,36 +618,45 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
     Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
                                                       ARM_AM::no_shift),
                                     MVT::i32);
-    return true;
+    return AM2_BASE;
   }
 
   // Match simple R +/- imm12 operands.
-  if (N.getOpcode() == ISD::ADD)
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int RHSC = (int)RHS->getZExtValue();
-      if ((RHSC >= 0 && RHSC < 0x1000) ||
-          (RHSC < 0 && RHSC > -0x1000)) { // 12 bits.
-        Base = N.getOperand(0);
-        if (Base.getOpcode() == ISD::FrameIndex) {
-          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
-        }
-        Offset = CurDAG->getRegister(0, MVT::i32);
+  if (N.getOpcode() != ISD::SUB) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) { // 12 bits.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
 
-        ARM_AM::AddrOpc AddSub = ARM_AM::add;
-        if (RHSC < 0) {
-          AddSub = ARM_AM::sub;
-          RHSC = - RHSC;
-        }
-        Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
-                                                          ARM_AM::no_shift),
-                                        MVT::i32);
-        return true;
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
       }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                        ARM_AM::no_shift),
+                                      MVT::i32);
+      return AM2_BASE;
     }
+  }
+
+  if (Subtarget->isCortexA9() && !N.hasOneUse()) {
+    // Compute R +/- (R << N) and reuse it.
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return AM2_BASE;
+  }
 
   // Otherwise this is R +/- [possibly shifted] R.
-  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
   ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
   unsigned ShAmt = 0;
 
@@ -339,14 +669,20 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
     if (ConstantSDNode *Sh =
            dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
       ShAmt = Sh->getZExtValue();
-      Offset = N.getOperand(1).getOperand(0);
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
     } else {
       ShOpcVal = ARM_AM::no_shift;
     }
   }
 
   // Try matching (R shl C) + (R).
-  if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
     ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
     if (ShOpcVal != ARM_AM::no_shift) {
       // Check to see if the RHS of the shift is a constant, if not, we can't
@@ -354,8 +690,15 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
       if (ConstantSDNode *Sh =
           dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
         ShAmt = Sh->getZExtValue();
-        Offset = N.getOperand(0).getOperand(0);
-        Base = N.getOperand(1);
+        if (!Subtarget->isCortexA9() ||
+            (N.hasOneUse() &&
+             isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
       } else {
         ShOpcVal = ARM_AM::no_shift;
       }
@@ -364,7 +707,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
 
   Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
                                   MVT::i32);
-  return true;
+  return AM2_SHOP;
 }
 
 bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
@@ -375,15 +718,13 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
     : cast<StoreSDNode>(Op)->getAddressingMode();
   ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
     ? ARM_AM::add : ARM_AM::sub;
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
-    int Val = (int)C->getZExtValue();
-    if (Val >= 0 && Val < 0x1000) { // 12 bits.
-      Offset = CurDAG->getRegister(0, MVT::i32);
-      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
-                                                        ARM_AM::no_shift),
-                                      MVT::i32);
-      return true;
-    }
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
   }
 
   Offset = N;
@@ -394,7 +735,12 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
     // it.
     if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
       ShAmt = Sh->getZExtValue();
-      Offset = N.getOperand(0);
+      if (isShifterOpProfitable(N, ShOpcVal, ShAmt))
+        Offset = N.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
     } else {
       ShOpcVal = ARM_AM::no_shift;
     }
@@ -406,7 +752,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
 }
 
 
-bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
                                       SDValue &Base, SDValue &Offset,
                                       SDValue &Opc) {
   if (N.getOpcode() == ISD::SUB) {
@@ -417,7 +763,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N,
     return true;
   }
 
-  if (N.getOpcode() != ISD::ADD) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
@@ -429,25 +775,23 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N,
   }
 
   // If the RHS is +/- imm8, fold into addr mode.
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    int RHSC = (int)RHS->getZExtValue();
-    if ((RHSC >= 0 && RHSC < 256) ||
-        (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
-      Base = N.getOperand(0);
-      if (Base.getOpcode() == ISD::FrameIndex) {
-        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
-      }
-      Offset = CurDAG->getRegister(0, MVT::i32);
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                              -256 + 1, 256, RHSC)) { // 8 bits.
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
 
-      ARM_AM::AddrOpc AddSub = ARM_AM::add;
-      if (RHSC < 0) {
-        AddSub = ARM_AM::sub;
-        RHSC = - RHSC;
-      }
-      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
-      return true;
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
     }
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+    return true;
   }
 
   Base = N.getOperand(0);
@@ -464,13 +808,11 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
     : cast<StoreSDNode>(Op)->getAddressingMode();
   ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
     ? ARM_AM::add : ARM_AM::sub;
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
-    int Val = (int)C->getZExtValue();
-    if (Val >= 0 && Val < 256) {
-      Offset = CurDAG->getRegister(0, MVT::i32);
-      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
-      return true;
-    }
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+    return true;
   }
 
   Offset = N;
@@ -478,16 +820,9 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectAddrMode4(SDNode *Op, SDValue N,
-                                      SDValue &Addr, SDValue &Mode) {
-  Addr = N;
-  Mode = CurDAG->getTargetConstant(ARM_AM::getAM4ModeImm(ARM_AM::ia), MVT::i32);
-  return true;
-}
-
-bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
                                       SDValue &Base, SDValue &Offset) {
-  if (N.getOpcode() != ISD::ADD) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
@@ -503,28 +838,23 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N,
   }
 
   // If the RHS is +/- imm8, fold into addr mode.
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    int RHSC = (int)RHS->getZExtValue();
-    if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied by 4.
-      RHSC >>= 2;
-      if ((RHSC >= 0 && RHSC < 256) ||
-          (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
-        Base = N.getOperand(0);
-        if (Base.getOpcode() == ISD::FrameIndex) {
-          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
-        }
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
+                              -256 + 1, 256, RHSC)) {
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
 
-        ARM_AM::AddrOpc AddSub = ARM_AM::add;
-        if (RHSC < 0) {
-          AddSub = ARM_AM::sub;
-          RHSC = - RHSC;
-        }
-        Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
-                                           MVT::i32);
-        return true;
-      }
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
     }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                       MVT::i32);
+    return true;
   }
 
   Base = N;
@@ -533,30 +863,50 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N,
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Op, SDValue N,
-                                      SDValue &Addr, SDValue &Align) {
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
+                                      SDValue &Align) {
   Addr = N;
-  // Default to no alignment.
-  Align = CurDAG->getTargetConstant(0, MVT::i32);
+
+  unsigned Alignment = 0;
+  if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) {
+    // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
+    // The maximum alignment is equal to the memory size being referenced.
+    unsigned LSNAlign = LSN->getAlignment();
+    unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8;
+    if (LSNAlign > MemSize && MemSize > 1)
+      Alignment = MemSize;
+  } else {
+    // All other uses of addrmode6 are for intrinsics.  For now just record
+    // the raw alignment value; it will be refined later based on the legal
+    // alignment operands for the intrinsic.
+    Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment();
+  }
+
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
                                        SDValue &Offset, SDValue &Label) {
   if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
     Offset = N.getOperand(0);
     SDValue N1 = N.getOperand(1);
-    Label  = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
-                                       MVT::i32);
+    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+                                      MVT::i32);
     return true;
   }
+
   return false;
 }
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N,
+
+//===----------------------------------------------------------------------===//
+//                         Thumb Addressing Modes
+//===----------------------------------------------------------------------===//
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
                                             SDValue &Base, SDValue &Offset){
-  // FIXME dl should come from the parent load or store, not the address
-  if (N.getOpcode() != ISD::ADD) {
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
     ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
     if (!NC || !NC->isNullValue())
       return false;
@@ -571,82 +921,137 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N,
 }
 
 bool
-ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDNode *Op, SDValue N,
-                                        unsigned Scale, SDValue &Base,
-                                        SDValue &OffImm, SDValue &Offset) {
+ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base,
+                                       SDValue &Offset, unsigned Scale) {
   if (Scale == 4) {
     SDValue TmpBase, TmpOffImm;
-    if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm))
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
       return false;  // We want to select tLDRspi / tSTRspi instead.
+
     if (N.getOpcode() == ARMISD::Wrapper &&
         N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
       return false;  // We want to select tLDRpci instead.
   }
 
-  if (N.getOpcode() != ISD::ADD) {
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Thumb does not have [sp, r] address mode.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP))
+    return false;
+
+  // FIXME: Why do we explicitly check for a match here and then return false?
+  // Presumably to allow something else to match, but shouldn't this be
+  // documented?
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC))
+    return false;
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 1);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 2);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 4);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
+                                          SDValue &Base, SDValue &OffImm) {
+  if (Scale == 4) {
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
     if (N.getOpcode() == ARMISD::Wrapper &&
         !(Subtarget->useMovt() &&
           N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
       Base = N.getOperand(0);
-    } else
+    } else {
       Base = N;
+    }
 
-    Offset = CurDAG->getRegister(0, MVT::i32);
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
 
-  // Thumb does not have [sp, r] address mode.
   RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
   RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
   if ((LHSR && LHSR->getReg() == ARM::SP) ||
       (RHSR && RHSR->getReg() == ARM::SP)) {
+    ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0));
+    ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    unsigned LHSC = LHS ? LHS->getZExtValue() : 0;
+    unsigned RHSC = RHS ? RHS->getZExtValue() : 0;
+
+    // Thumb does not have [sp, #imm5] address mode for non-zero imm5.
+    if (LHSC != 0 || RHSC != 0) return false;
+
     Base = N;
-    Offset = CurDAG->getRegister(0, MVT::i32);
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
 
   // If the RHS is + imm5 * scale, fold into addr mode.
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    int RHSC = (int)RHS->getZExtValue();
-    if ((RHSC & (Scale-1)) == 0) {  // The constant is implicitly multiplied.
-      RHSC /= Scale;
-      if (RHSC >= 0 && RHSC < 32) {
-        Base = N.getOperand(0);
-        Offset = CurDAG->getRegister(0, MVT::i32);
-        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
-        return true;
-      }
-    }
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
+    Base = N.getOperand(0);
+    OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+    return true;
   }
 
   Base = N.getOperand(0);
-  Offset = N.getOperand(1);
   OffImm = CurDAG->getTargetConstant(0, MVT::i32);
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDNode *Op, SDValue N,
-                                            SDValue &Base, SDValue &OffImm,
-                                            SDValue &Offset) {
-  return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset);
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 4, Base, OffImm);
 }
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDNode *Op, SDValue N,
-                                            SDValue &Base, SDValue &OffImm,
-                                            SDValue &Offset) {
-  return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset);
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 2, Base, OffImm);
 }
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDNode *Op, SDValue N,
-                                            SDValue &Base, SDValue &OffImm,
-                                            SDValue &Offset) {
-  return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset);
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 1, Base, OffImm);
 }
 
-bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N,
-                                           SDValue &Base, SDValue &OffImm) {
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
@@ -654,35 +1059,35 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N,
     return true;
   }
 
-  if (N.getOpcode() != ISD::ADD)
+  if (!CurDAG->isBaseWithConstantOffset(N))
     return false;
 
   RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
   if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
       (LHSR && LHSR->getReg() == ARM::SP)) {
     // If the RHS is + imm8 * scale, fold into addr mode.
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int RHSC = (int)RHS->getZExtValue();
-      if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied.
-        RHSC >>= 2;
-        if (RHSC >= 0 && RHSC < 256) {
-          Base = N.getOperand(0);
-          if (Base.getOpcode() == ISD::FrameIndex) {
-            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-            Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
-          }
-          OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
-          return true;
-        }
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
       }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
     }
   }
 
   return false;
 }
 
-bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N,
-                                                SDValue &BaseReg,
+
+//===----------------------------------------------------------------------===//
+//                        Thumb 2 Addressing Modes
+//===----------------------------------------------------------------------===//
+
+
+bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
                                                 SDValue &Opc) {
   if (DisableShifterOp)
     return false;
@@ -704,19 +1109,22 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N,
   return false;
 }
 
-bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
                                             SDValue &Base, SDValue &OffImm) {
   // Match simple R + imm12 operands.
 
   // Base only.
-  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
     if (N.getOpcode() == ISD::FrameIndex) {
-      // Match frame index...
+      // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
       Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
-    } else if (N.getOpcode() == ARMISD::Wrapper &&
+    }
+    
+    if (N.getOpcode() == ARMISD::Wrapper &&
                !(Subtarget->useMovt() &&
                  N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
       Base = N.getOperand(0);
@@ -729,7 +1137,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N,
   }
 
   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    if (SelectT2AddrModeImm8(Op, N, Base, OffImm))
+    if (SelectT2AddrModeImm8(N, Base, OffImm))
       // Let t2LDRi8 handle (R - imm8).
       return false;
 
@@ -754,24 +1162,26 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N,
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
                                            SDValue &Base, SDValue &OffImm) {
   // Match simple R - imm8 operands.
-  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) {
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int RHSC = (int)RHS->getSExtValue();
-      if (N.getOpcode() == ISD::SUB)
-        RHSC = -RHSC;
-
-      if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
-        Base = N.getOperand(0);
-        if (Base.getOpcode() == ISD::FrameIndex) {
-          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
-        }
-        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
-        return true;
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+  
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getSExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
       }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
     }
   }
 
@@ -784,52 +1194,22 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
   ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
     ? cast<LoadSDNode>(Op)->getAddressingMode()
     : cast<StoreSDNode>(Op)->getAddressingMode();
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) {
-    int RHSC = (int)RHS->getZExtValue();
-    if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
-      OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
-        ? CurDAG->getTargetConstant(RHSC, MVT::i32)
-        : CurDAG->getTargetConstant(-RHSC, MVT::i32);
-      return true;
-    }
-  }
-
-  return false;
-}
-
-bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N,
-                                             SDValue &Base, SDValue &OffImm) {
-  if (N.getOpcode() == ISD::ADD) {
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int RHSC = (int)RHS->getZExtValue();
-      // 8 bits.
-      if (((RHSC & 0x3) == 0) &&
-          ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) {
-        Base   = N.getOperand(0);
-        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
-        return true;
-      }
-    }
-  } else if (N.getOpcode() == ISD::SUB) {
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int RHSC = (int)RHS->getZExtValue();
-      // 8 bits.
-      if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) {
-        Base   = N.getOperand(0);
-        OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32);
-        return true;
-      }
-    }
+  int RHSC;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
+    OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+      ? CurDAG->getTargetConstant(RHSC, MVT::i32)
+      : CurDAG->getTargetConstant(-RHSC, MVT::i32);
+    return true;
   }
 
   return false;
 }
 
-bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
                                             SDValue &Base,
                                             SDValue &OffReg, SDValue &ShImm) {
   // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
-  if (N.getOpcode() != ISD::ADD)
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N))
     return false;
 
   // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
@@ -841,6 +1221,12 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N,
       return false;
   }
 
+  if (Subtarget->isCortexA9() && !N.hasOneUse()) {
+    // Compute R + (R << [1,2,3]) and reuse it.
+    Base = N;
+    return false;
+  }
+
   // Look for (R + R) or (R + (R << [1,2,3])).
   unsigned ShAmt = 0;
   Base   = N.getOperand(0);
@@ -859,11 +1245,12 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N,
     // it.
     if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
       ShAmt = Sh->getZExtValue();
-      if (ShAmt >= 4) {
+      if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt))
+        OffReg = OffReg.getOperand(0);
+      else {
         ShAmt = 0;
         ShOpcVal = ARM_AM::no_shift;
-      } else
-        OffReg = OffReg.getOperand(0);
+      }
     } else {
       ShOpcVal = ARM_AM::no_shift;
     }
@@ -1045,52 +1432,43 @@ SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1,
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8);
 }
 
-/// OctoDRegs - Form 8 consecutive D registers.
-///
-SDNode *ARMDAGToDAGISel::OctoDRegs(EVT VT, SDValue V0, SDValue V1,
-                                   SDValue V2, SDValue V3,
-                                   SDValue V4, SDValue V5,
-                                   SDValue V6, SDValue V7) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
-  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
-  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
-  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32);
-  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32);
-  SDValue SubReg4 = CurDAG->getTargetConstant(ARM::dsub_4, MVT::i32);
-  SDValue SubReg5 = CurDAG->getTargetConstant(ARM::dsub_5, MVT::i32);
-  SDValue SubReg6 = CurDAG->getTargetConstant(ARM::dsub_6, MVT::i32);
-  SDValue SubReg7 = CurDAG->getTargetConstant(ARM::dsub_7, MVT::i32);
-  const SDValue Ops[] ={ V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3,
-                         V4, SubReg4, V5, SubReg5, V6, SubReg6, V7, SubReg7 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 16);
+/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
+/// of a NEON VLD or VST instruction.  The supported values depend on the
+/// number of registers being loaded.
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
+                                       bool is64BitVector) {
+  unsigned NumRegs = NumVecs;
+  if (!is64BitVector && NumVecs < 3)
+    NumRegs *= 2;
+
+  unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+  if (Alignment >= 32 && NumRegs == 4)
+    Alignment = 32;
+  else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4))
+    Alignment = 16;
+  else if (Alignment >= 8)
+    Alignment = 8;
+  else
+    Alignment = 0;
+
+  return CurDAG->getTargetConstant(Alignment, MVT::i32);
 }
 
-/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type
-/// for a 64-bit subregister of the vector.
-static EVT GetNEONSubregVT(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: llvm_unreachable("unhandled NEON type");
-  case MVT::v16i8: return MVT::v8i8;
-  case MVT::v8i16: return MVT::v4i16;
-  case MVT::v4f32: return MVT::v2f32;
-  case MVT::v4i32: return MVT::v2i32;
-  case MVT::v2i64: return MVT::v1i64;
-  }
-}
-
-SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
+SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
                                    unsigned *DOpcodes, unsigned *QOpcodes0,
                                    unsigned *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
   DebugLoc dl = N->getDebugLoc();
 
   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return NULL;
 
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
   bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
 
   unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
@@ -1120,88 +1498,97 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
       ResTyElts *= 2;
     ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
   }
+  std::vector<EVT> ResTys;
+  ResTys.push_back(ResTy);
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
 
   SDValue Pred = getAL(CurDAG);
   SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SDValue SuperReg;
-  if (is64BitVector) {
-    unsigned Opc = DOpcodes[OpcodeIndex];
-    const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
-    SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
-    if (NumVecs == 1)
-      return VLd;
-
-    SuperReg = SDValue(VLd, 0);
-    assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
-      SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec,
-                                                 dl, VT, SuperReg);
-      ReplaceUses(SDValue(N, Vec), D);
-    }
-    ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
-    return NULL;
-  }
-
-  if (NumVecs <= 2) {
-    // Quad registers are directly supported for VLD1 and VLD2,
-    // loading pairs of D regs.
-    unsigned Opc = QOpcodes0[OpcodeIndex];
-    const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
-    SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
-    if (NumVecs == 1)
-      return VLd;
+  SDNode *VLd;
+  SmallVector<SDValue, 7> Ops;
 
-    SuperReg = SDValue(VLd, 0);
-    Chain = SDValue(VLd, 1);
+  // Double registers and VLD1/VLD2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
 
   } else {
     // Otherwise, quad registers are loaded with two separate instructions,
     // where one loads the even registers and the other loads the odd registers.
     EVT AddrTy = MemAddr.getValueType();
 
-    // Load the even subregs.
-    unsigned Opc = QOpcodes0[OpcodeIndex];
+    // Load the even subregs.  This is always an updating load, so that it
+    // provides the address to the second load for the odd subregs.
     SDValue ImplDef =
       SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
     const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
-    SDNode *VLdA =
-      CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsA, 7);
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                          ResTy, AddrTy, MVT::Other, OpsA, 7);
     Chain = SDValue(VLdA, 2);
 
     // Load the odd subregs.
-    Opc = QOpcodes1[OpcodeIndex];
-    const SDValue OpsB[] = { SDValue(VLdA, 1), Align, Reg0, SDValue(VLdA, 0),
-                             Pred, Reg0, Chain };
-    SDNode *VLdB =
-      CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsB, 7);
-    SuperReg = SDValue(VLdB, 0);
-    Chain = SDValue(VLdB, 2);
-  }
-
-  // Extract out the Q registers.
-  assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
-    SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec,
-                                               dl, VT, SuperReg);
-    ReplaceUses(SDValue(N, Vec), Q);
-  }
-  ReplaceUses(SDValue(N, NumVecs), Chain);
+    Ops.push_back(SDValue(VLdA, 1));
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      assert(isa<ConstantSDNode>(Inc.getNode()) &&
+             "only constant post-increment update allowed for VLD3/4");
+      (void)Inc;
+      Ops.push_back(Reg0);
+    }
+    Ops.push_back(SDValue(VLdA, 0));
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                 Ops.data(), Ops.size());
+  }
+
+  if (NumVecs == 1)
+    return VLd;
+
+  // Extract out the subregisters.
+  SDValue SuperReg = SDValue(VLd, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
   return NULL;
 }
 
-SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
+SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
                                    unsigned *DOpcodes, unsigned *QOpcodes0,
                                    unsigned *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
   DebugLoc dl = N->getDebugLoc();
 
   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return NULL;
 
   SDValue Chain = N->getOperand(0);
-  EVT VT = N->getOperand(3).getValueType();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
   bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
 
   unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
@@ -1222,119 +1609,128 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
     break;
   }
 
+  std::vector<EVT> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
   SDValue Pred = getAL(CurDAG);
   SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-
   SmallVector<SDValue, 7> Ops;
-  Ops.push_back(MemAddr);
-  Ops.push_back(Align);
 
-  if (is64BitVector) {
+  // Double registers and VST1/VST2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    SDValue SrcReg;
     if (NumVecs == 1) {
-      Ops.push_back(N->getOperand(3));
-    } else {
-      SDValue RegSeq;
-      SDValue V0 = N->getOperand(0+3);
-      SDValue V1 = N->getOperand(1+3);
-
+      SrcReg = N->getOperand(Vec0Idx);
+    } else if (is64BitVector) {
       // Form a REG_SEQUENCE to force register allocation.
+      SDValue V0 = N->getOperand(Vec0Idx + 0);
+      SDValue V1 = N->getOperand(Vec0Idx + 1);
       if (NumVecs == 2)
-        RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
+        SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
       else {
-        SDValue V2 = N->getOperand(2+3);
-        // If it's a vld3, form a quad D-register and leave the last part as 
+        SDValue V2 = N->getOperand(Vec0Idx + 2);
+        // If it's a vst3, form a quad D-register and leave the last part as
         // an undef.
         SDValue V3 = (NumVecs == 3)
           ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
-          : N->getOperand(3+3);
-        RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
+          : N->getOperand(Vec0Idx + 3);
+        SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
       }
-      Ops.push_back(RegSeq);
-    }
-    Ops.push_back(Pred);
-    Ops.push_back(Reg0); // predicate register
-    Ops.push_back(Chain);
-    unsigned Opc = DOpcodes[OpcodeIndex];
-    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6);
-  }
-
-  if (NumVecs <= 2) {
-    // Quad registers are directly supported for VST1 and VST2.
-    unsigned Opc = QOpcodes0[OpcodeIndex];
-    if (NumVecs == 1) {
-      Ops.push_back(N->getOperand(3));
     } else {
       // Form a QQ register.
-      SDValue Q0 = N->getOperand(3);
-      SDValue Q1 = N->getOperand(4);
-      Ops.push_back(SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0));
+      SDValue Q0 = N->getOperand(Vec0Idx);
+      SDValue Q1 = N->getOperand(Vec0Idx + 1);
+      SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0);
+    }
+
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
     }
+    Ops.push_back(SrcReg);
     Ops.push_back(Pred);
-    Ops.push_back(Reg0); // predicate register
+    Ops.push_back(Reg0);
     Ops.push_back(Chain);
-    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6);
+    return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
   }
 
   // Otherwise, quad registers are stored with two separate instructions,
   // where one stores the even registers and the other stores the odd registers.
 
   // Form the QQQQ REG_SEQUENCE.
-  SDValue V0 = N->getOperand(0+3);
-  SDValue V1 = N->getOperand(1+3);
-  SDValue V2 = N->getOperand(2+3);
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  SDValue V2 = N->getOperand(Vec0Idx + 2);
   SDValue V3 = (NumVecs == 3)
     ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
-    : N->getOperand(3+3);
+    : N->getOperand(Vec0Idx + 3);
   SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0);
 
-  // Store the even D registers.
-  Ops.push_back(Reg0); // post-access address offset
-  Ops.push_back(RegSeq);
-  Ops.push_back(Pred);
-  Ops.push_back(Reg0); // predicate register
-  Ops.push_back(Chain);
-  unsigned Opc = QOpcodes0[OpcodeIndex];
-  SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
-                                        MVT::Other, Ops.data(), 7);
+  // Store the even D registers.  This is always an updating store, so that it
+  // provides the address to the second store for the odd subregs.
+  const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
+  SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                        MemAddr.getValueType(),
+                                        MVT::Other, OpsA, 7);
   Chain = SDValue(VStA, 1);
 
   // Store the odd D registers.
-  Ops[0] = SDValue(VStA, 0); // MemAddr
-  Ops[6] = Chain;
-  Opc = QOpcodes1[OpcodeIndex];
-  SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
-                                        MVT::Other, Ops.data(), 7);
-  Chain = SDValue(VStB, 1);
-  ReplaceUses(SDValue(N, 0), Chain);
-  return NULL;
+  Ops.push_back(SDValue(VStA, 0));
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    assert(isa<ConstantSDNode>(Inc.getNode()) &&
+           "only constant post-increment update allowed for VST3/4");
+    (void)Inc;
+    Ops.push_back(Reg0);
+  }
+  Ops.push_back(RegSeq);
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+  return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                Ops.data(), Ops.size());
 }
 
 SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
-                                         unsigned NumVecs, unsigned *DOpcodes,
-                                         unsigned *QOpcodes0,
-                                         unsigned *QOpcodes1) {
+                                         bool isUpdating, unsigned NumVecs,
+                                         unsigned *DOpcodes,
+                                         unsigned *QOpcodes) {
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
   DebugLoc dl = N->getDebugLoc();
 
   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return NULL;
 
   SDValue Chain = N->getOperand(0);
   unsigned Lane =
-    cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
-  EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
+    cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
   bool is64BitVector = VT.is64BitVector();
 
-  // Quad registers are handled by load/store of subregs. Find the subreg info.
-  unsigned NumElts = 0;
-  bool Even = false;
-  EVT RegVT = VT;
-  if (!is64BitVector) {
-    RegVT = GetNEONSubregVT(VT);
-    NumElts = RegVT.getVectorNumElements();
-    Even = Lane < NumElts;
-  }
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
 
   unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
@@ -1350,124 +1746,144 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   case MVT::v4i32: OpcodeIndex = 1; break;
   }
 
+  std::vector<EVT> ResTys;
+  if (IsLoad) {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+                                      MVT::i64, ResTyElts));
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
   SDValue Pred = getAL(CurDAG);
   SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
 
-  SmallVector<SDValue, 10> Ops;
+  SmallVector<SDValue, 8> Ops;
   Ops.push_back(MemAddr);
   Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+  }
 
-  unsigned Opc = 0;
-  if (is64BitVector) {
-    Opc = DOpcodes[OpcodeIndex];
-    SDValue RegSeq;
-    SDValue V0 = N->getOperand(0+3);
-    SDValue V1 = N->getOperand(1+3);
-    if (NumVecs == 2) {
-      RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
-    } else {
-      SDValue V2 = N->getOperand(2+3);
-      SDValue V3 = (NumVecs == 3)
-        ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
-        : N->getOperand(3+3);
-      RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
-    }
-
-    // Now extract the D registers back out.
-    Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq));
-    Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq));
-    if (NumVecs > 2)
-      Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq));
-    if (NumVecs > 3)
-      Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq));
+  SDValue SuperReg;
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  if (NumVecs == 2) {
+    if (is64BitVector)
+      SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
+    else
+      SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0);
   } else {
-    // Check if this is loading the even or odd subreg of a Q register.
-    if (Lane < NumElts) {
-      Opc = QOpcodes0[OpcodeIndex];
-    } else {
-      Lane -= NumElts;
-      Opc = QOpcodes1[OpcodeIndex];
-    }
-
-    SDValue RegSeq;
-    SDValue V0 = N->getOperand(0+3);
-    SDValue V1 = N->getOperand(1+3);
-    if (NumVecs == 2) {
-      RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0);
-    } else {
-      SDValue V2 = N->getOperand(2+3);
-      SDValue V3 = (NumVecs == 3)
-        ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
-        : N->getOperand(3+3);
-      RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0);
-    }
-
-    // Extract the subregs of the input vector.
-    unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1;
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-      Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT,
-                                                   RegSeq));
+    SDValue V2 = N->getOperand(Vec0Idx + 2);
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(Vec0Idx + 3);
+    if (is64BitVector)
+      SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
+    else
+      SuperReg = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0);
   }
+  Ops.push_back(SuperReg);
   Ops.push_back(getI32Imm(Lane));
   Ops.push_back(Pred);
   Ops.push_back(Reg0);
   Ops.push_back(Chain);
 
+  unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                                  QOpcodes[OpcodeIndex]);
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys,
+                                         Ops.data(), Ops.size());
   if (!IsLoad)
-    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+6);
+    return VLdLn;
 
-  std::vector<EVT> ResTys(NumVecs, RegVT);
-  ResTys.push_back(MVT::Other);
-  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6);
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  return NULL;
+}
 
-  // Form a REG_SEQUENCE to force register allocation.
-  SDValue RegSeq;
-  if (is64BitVector) {
-    SDValue V0 = SDValue(VLdLn, 0);
-    SDValue V1 = SDValue(VLdLn, 1);
-    if (NumVecs == 2) {
-      RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
-    } else {
-      SDValue V2 = SDValue(VLdLn, 2);
-      // If it's a vld3, form a quad D-register but discard the last part.
-      SDValue V3 = (NumVecs == 3)
-        ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
-        : SDValue(VLdLn, 3);
-      RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
-    }
-  } else {
-    // For 128-bit vectors, take the 64-bit results of the load and insert
-    // them as subregs into the result.
-    SDValue V[8];
-    for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) {
-      if (Even) {
-        V[i]   = SDValue(VLdLn, Vec);
-        V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                dl, RegVT), 0);
-      } else {
-        V[i]   = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                dl, RegVT), 0);
-        V[i+1] = SDValue(VLdLn, Vec);
-      }
-    }
-    if (NumVecs == 3)
-      V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                   dl, RegVT), 0);
+SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+                                      unsigned NumVecs, unsigned *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
 
-    if (NumVecs == 2)
-      RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0);
-    else
-      RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3],
-                                 V[4], V[5], V[6], V[7]), 0);
+  SDValue MemAddr, Align;
+  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld-dup type");
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  }
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDValue SuperReg;
+  unsigned Opc = Opcodes[OpcodeIndex];
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(2);
+    Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
   }
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
 
+  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  std::vector<EVT> ResTys;
+  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts));
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+  SDNode *VLdDup =
+    CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+  SuperReg = SDValue(VLdDup, 0);
+
+  // Extract the subregisters.
   assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
-  assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
-  unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+  unsigned SubIdx = ARM::dsub_0;
   for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
     ReplaceUses(SDValue(N, Vec),
-                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq));
-  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs));
+                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
   return NULL;
 }
 
@@ -1486,7 +1902,7 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
     RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0);
   else {
     SDValue V2 = N->getOperand(FirstTblReg + 2);
-    // If it's a vtbl3, form a quad D-register and leave the last part as 
+    // If it's a vtbl3, form a quad D-register and leave the last part as
     // an undef.
     SDValue V3 = (NumVecs == 3)
       ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
@@ -1494,17 +1910,10 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
     RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
   }
 
-  // Now extract the D registers back out.
   SmallVector<SDValue, 6> Ops;
   if (IsExt)
     Ops.push_back(N->getOperand(1));
-  Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq));
-  Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq));
-  if (NumVecs > 2)
-    Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, RegSeq));
-  if (NumVecs > 3)
-    Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq));
-
+  Ops.push_back(RegSeq);
   Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
   Ops.push_back(getAL(CurDAG)); // predicate
   Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
@@ -1574,7 +1983,7 @@ SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
   SDValue CPTmp0;
   SDValue CPTmp1;
-  if (SelectT2ShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1)) {
+  if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
     unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
     unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
     unsigned Opc = 0;
@@ -1602,7 +2011,7 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
   SDValue CPTmp0;
   SDValue CPTmp1;
   SDValue CPTmp2;
-  if (SelectShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
+  if (SelectShifterOperandReg(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
     SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
     SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
     return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7);
@@ -1611,36 +2020,66 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
 }
 
 SDNode *ARMDAGToDAGISel::
-SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                  ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
   ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
   if (!T)
     return 0;
 
-  if (Pred_t2_so_imm(TrueVal.getNode())) {
-    SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32);
+  unsigned Opc = 0;
+  unsigned TrueImm = T->getZExtValue();
+  if (is_t2_so_imm(TrueImm)) {
+    Opc = ARM::t2MOVCCi;
+  } else if (TrueImm <= 0xffff) {
+    Opc = ARM::t2MOVCCi16;
+  } else if (is_t2_so_imm_not(TrueImm)) {
+    TrueImm = ~TrueImm;
+    Opc = ARM::t2MVNCCi;
+  } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) {
+    // Large immediate.
+    Opc = ARM::t2MOVCCi32imm;
+  }
+
+  if (Opc) {
+    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
     SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
     SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N,
-                                ARM::t2MOVCCi, MVT::i32, Ops, 5);
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
   }
+
   return 0;
 }
 
 SDNode *ARMDAGToDAGISel::
-SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                   ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
   ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
   if (!T)
     return 0;
 
-  if (Pred_so_imm(TrueVal.getNode())) {
-    SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32);
+  unsigned Opc = 0;
+  unsigned TrueImm = T->getZExtValue();
+  bool isSoImm = is_so_imm(TrueImm);
+  if (isSoImm) {
+    Opc = ARM::MOVCCi;
+  } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) {
+    Opc = ARM::MOVCCi16;
+  } else if (is_so_imm_not(TrueImm)) {
+    TrueImm = ~TrueImm;
+    Opc = ARM::MVNCCi;
+  } else if (TrueVal.getNode()->hasOneUse() &&
+             (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) {
+    // Large immediate.
+    Opc = ARM::MOVCCi32imm;
+  }
+
+  if (Opc) {
+    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
     SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
     SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N,
-                                ARM::MOVCCi, MVT::i32, Ops, 5);
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
   }
+
   return 0;
 }
 
@@ -1688,18 +2127,18 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
     //           (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
     // Pattern complexity = 10  cost = 1  size = 0
     if (Subtarget->isThumb()) {
-      SDNode *Res = SelectT2CMOVSoImmOp(N, FalseVal, TrueVal,
+      SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal,
                                         CCVal, CCR, InFlag);
       if (!Res)
-        Res = SelectT2CMOVSoImmOp(N, TrueVal, FalseVal,
+        Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal,
                                ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
       if (Res)
         return Res;
     } else {
-      SDNode *Res = SelectARMCMOVSoImmOp(N, FalseVal, TrueVal,
+      SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal,
                                          CCVal, CCR, InFlag);
       if (!Res)
-        Res = SelectARMCMOVSoImmOp(N, TrueVal, FalseVal,
+        Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal,
                                ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
       if (Res)
         return Res;
@@ -1742,13 +2181,7 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   EVT VT = N->getValueType(0);
   if (!VT.is128BitVector() || N->getNumOperands() != 2)
     llvm_unreachable("unexpected CONCAT_VECTORS");
-  DebugLoc dl = N->getDebugLoc();
-  SDValue V0 = N->getOperand(0);
-  SDValue V1 = N->getOperand(1);
-  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
-  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4);
+  return PairDRegs(VT, N->getOperand(0), N->getOperand(1));
 }
 
 SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
@@ -1788,19 +2221,18 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         SDValue Pred = getAL(CurDAG);
         SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
         SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
-        ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
+        ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
                                          Ops, 4);
       } else {
         SDValue Ops[] = {
           CPIdx,
-          CurDAG->getRegister(0, MVT::i32),
           CurDAG->getTargetConstant(0, MVT::i32),
           getAL(CurDAG),
           CurDAG->getRegister(0, MVT::i32),
           CurDAG->getEntryNode()
         };
         ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
-                                       Ops, 6);
+                                       Ops, 5);
       }
       ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
       return NULL;
@@ -1930,7 +2362,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::UMULL : ARM::UMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops, 5);
     }
   }
   case ISD::SMUL_LOHI: {
@@ -1944,7 +2378,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::SMULL : ARM::SMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops, 5);
     }
   }
   case ISD::LOAD: {
@@ -1987,7 +2423,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                MVT::i32);
     SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
     SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
-                                             MVT::Flag, Ops, 5);
+                                             MVT::Glue, Ops, 5);
     Chain = SDValue(ResNode, 0);
     if (N->getNumValues() == 2) {
       InFlag = SDValue(ResNode, 1);
@@ -2088,12 +2524,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     EVT VecVT = N->getValueType(0);
     EVT EltVT = VecVT.getVectorElementType();
     unsigned NumElts = VecVT.getVectorNumElements();
-    if (EltVT.getSimpleVT() == MVT::f64) {
+    if (EltVT == MVT::f64) {
       assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
       return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1));
     }
-    assert(EltVT.getSimpleVT() == MVT::f32 &&
-           "unexpected type for BUILD_VECTOR");
+    assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
     if (NumElts == 2)
       return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1));
     assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
@@ -2101,6 +2536,170 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                      N->getOperand(2), N->getOperand(3));
   }
 
+  case ARMISD::VLD2DUP: {
+    unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
+                           ARM::VLD2DUPd32Pseudo };
+    return SelectVLDDup(N, false, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP: {
+    unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo,
+                           ARM::VLD3DUPd32Pseudo };
+    return SelectVLDDup(N, false, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP: {
+    unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
+                           ARM::VLD4DUPd32Pseudo };
+    return SelectVLDDup(N, false, 4, Opcodes);
+  }
+
+  case ARMISD::VLD2DUP_UPD: {
+    unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD,
+                           ARM::VLD2DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP_UPD: {
+    unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD,
+                           ARM::VLD3DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP_UPD: {
+    unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD,
+                           ARM::VLD4DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 4, Opcodes);
+  }
+
+  case ARMISD::VLD1_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD,
+                            ARM::VLD1d32_UPD, ARM::VLD1d64_UPD };
+    unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD,
+                            ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
+    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VLD2_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD,
+                            ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD,
+                            ARM::VLD2q32Pseudo_UPD };
+    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VLD3_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD,
+                            ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD };
+    unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                             ARM::VLD3q16Pseudo_UPD,
+                             ARM::VLD3q32Pseudo_UPD };
+    unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+                             ARM::VLD3q16oddPseudo_UPD,
+                             ARM::VLD3q32oddPseudo_UPD };
+    return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD4_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD,
+                            ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD };
+    unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                             ARM::VLD4q16Pseudo_UPD,
+                             ARM::VLD4q32Pseudo_UPD };
+    unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+                             ARM::VLD4q16oddPseudo_UPD,
+                             ARM::VLD4q32oddPseudo_UPD };
+    return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD2LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD,
+                            ARM::VLD2LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+                            ARM::VLD2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD3LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD,
+                            ARM::VLD3LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+                            ARM::VLD3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD4LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD,
+                            ARM::VLD4LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+                            ARM::VLD4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST1_UPD: {
+    unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD,
+                            ARM::VST1d32_UPD, ARM::VST1d64_UPD };
+    unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD,
+                            ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
+    return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VST2_UPD: {
+    unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD,
+                            ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD,
+                            ARM::VST2q32Pseudo_UPD };
+    return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VST3_UPD: {
+    unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD,
+                            ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD };
+    unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                             ARM::VST3q16Pseudo_UPD,
+                             ARM::VST3q32Pseudo_UPD };
+    unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+                             ARM::VST3q16oddPseudo_UPD,
+                             ARM::VST3q32oddPseudo_UPD };
+    return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST4_UPD: {
+    unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD,
+                            ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD };
+    unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                             ARM::VST4q16Pseudo_UPD,
+                             ARM::VST4q32Pseudo_UPD };
+    unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+                             ARM::VST4q16oddPseudo_UPD,
+                             ARM::VST4q32oddPseudo_UPD };
+    return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST2LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD,
+                            ARM::VST2LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+                            ARM::VST2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST3LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD,
+                            ARM::VST3LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+                            ARM::VST3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST4LN_UPD: {
+    unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD,
+                            ARM::VST4LNd32Pseudo_UPD };
+    unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+                            ARM::VST4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+  }
+
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2113,7 +2712,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                               ARM::VLD1d32, ARM::VLD1d64 };
       unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo,
                               ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo };
-      return SelectVLD(N, 1, DOpcodes, QOpcodes, 0);
+      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
     }
 
     case Intrinsic::arm_neon_vld2: {
@@ -2121,7 +2720,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                               ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo };
       unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
                               ARM::VLD2q32Pseudo };
-      return SelectVLD(N, 2, DOpcodes, QOpcodes, 0);
+      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
     }
 
     case Intrinsic::arm_neon_vld3: {
@@ -2130,10 +2729,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
                                ARM::VLD3q16Pseudo_UPD,
                                ARM::VLD3q32Pseudo_UPD };
-      unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
-                               ARM::VLD3q16oddPseudo_UPD,
-                               ARM::VLD3q32oddPseudo_UPD };
-      return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+                               ARM::VLD3q16oddPseudo,
+                               ARM::VLD3q32oddPseudo };
+      return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
     }
 
     case Intrinsic::arm_neon_vld4: {
@@ -2142,31 +2741,31 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
                                ARM::VLD4q16Pseudo_UPD,
                                ARM::VLD4q32Pseudo_UPD };
-      unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
-                               ARM::VLD4q16oddPseudo_UPD,
-                               ARM::VLD4q32oddPseudo_UPD };
-      return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+                               ARM::VLD4q16oddPseudo,
+                               ARM::VLD4q32oddPseudo };
+      return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
     }
 
     case Intrinsic::arm_neon_vld2lane: {
-      unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 };
-      unsigned QOpcodes0[] = { ARM::VLD2LNq16, ARM::VLD2LNq32 };
-      unsigned QOpcodes1[] = { ARM::VLD2LNq16odd, ARM::VLD2LNq32odd };
-      return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo,
+                              ARM::VLD2LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
     }
 
     case Intrinsic::arm_neon_vld3lane: {
-      unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 };
-      unsigned QOpcodes0[] = { ARM::VLD3LNq16, ARM::VLD3LNq32 };
-      unsigned QOpcodes1[] = { ARM::VLD3LNq16odd, ARM::VLD3LNq32odd };
-      return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo,
+                              ARM::VLD3LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
     }
 
     case Intrinsic::arm_neon_vld4lane: {
-      unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 };
-      unsigned QOpcodes0[] = { ARM::VLD4LNq16, ARM::VLD4LNq32 };
-      unsigned QOpcodes1[] = { ARM::VLD4LNq16odd, ARM::VLD4LNq32odd };
-      return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo,
+                              ARM::VLD4LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
     }
 
     case Intrinsic::arm_neon_vst1: {
@@ -2174,7 +2773,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                               ARM::VST1d32, ARM::VST1d64 };
       unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo,
                               ARM::VST1q32Pseudo, ARM::VST1q64Pseudo };
-      return SelectVST(N, 1, DOpcodes, QOpcodes, 0);
+      return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
     }
 
     case Intrinsic::arm_neon_vst2: {
@@ -2182,7 +2781,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                               ARM::VST2d32Pseudo, ARM::VST1q64Pseudo };
       unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
                               ARM::VST2q32Pseudo };
-      return SelectVST(N, 2, DOpcodes, QOpcodes, 0);
+      return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
     }
 
     case Intrinsic::arm_neon_vst3: {
@@ -2191,10 +2790,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
                                ARM::VST3q16Pseudo_UPD,
                                ARM::VST3q32Pseudo_UPD };
-      unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
-                               ARM::VST3q16oddPseudo_UPD,
-                               ARM::VST3q32oddPseudo_UPD };
-      return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                               ARM::VST3q16oddPseudo,
+                               ARM::VST3q32oddPseudo };
+      return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
     }
 
     case Intrinsic::arm_neon_vst4: {
@@ -2203,31 +2802,31 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
                                ARM::VST4q16Pseudo_UPD,
                                ARM::VST4q32Pseudo_UPD };
-      unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
-                               ARM::VST4q16oddPseudo_UPD,
-                               ARM::VST4q32oddPseudo_UPD };
-      return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                               ARM::VST4q16oddPseudo,
+                               ARM::VST4q32oddPseudo };
+      return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
     }
 
     case Intrinsic::arm_neon_vst2lane: {
-      unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 };
-      unsigned QOpcodes0[] = { ARM::VST2LNq16, ARM::VST2LNq32 };
-      unsigned QOpcodes1[] = { ARM::VST2LNq16odd, ARM::VST2LNq32odd };
-      return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
+                              ARM::VST2LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
     }
 
     case Intrinsic::arm_neon_vst3lane: {
-      unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 };
-      unsigned QOpcodes0[] = { ARM::VST3LNq16, ARM::VST3LNq32 };
-      unsigned QOpcodes1[] = { ARM::VST3LNq16odd, ARM::VST3LNq32odd };
-      return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
+                              ARM::VST3LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
     }
 
     case Intrinsic::arm_neon_vst4lane: {
-      unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 };
-      unsigned QOpcodes0[] = { ARM::VST4LNq16, ARM::VST4LNq32 };
-      unsigned QOpcodes1[] = { ARM::VST4LNq16odd, ARM::VST4LNq32odd };
-      return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
+                              ARM::VST4LNd32Pseudo };
+      unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
     }
     }
     break;
@@ -2240,18 +2839,18 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       break;
 
     case Intrinsic::arm_neon_vtbl2:
-      return SelectVTBL(N, false, 2, ARM::VTBL2);
+      return SelectVTBL(N, false, 2, ARM::VTBL2Pseudo);
     case Intrinsic::arm_neon_vtbl3:
-      return SelectVTBL(N, false, 3, ARM::VTBL3);
+      return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
     case Intrinsic::arm_neon_vtbl4:
-      return SelectVTBL(N, false, 4, ARM::VTBL4);
+      return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
 
     case Intrinsic::arm_neon_vtbx2:
-      return SelectVTBL(N, true, 2, ARM::VTBX2);
+      return SelectVTBL(N, true, 2, ARM::VTBX2Pseudo);
     case Intrinsic::arm_neon_vtbx3:
-      return SelectVTBL(N, true, 3, ARM::VTBX3);
+      return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
     case Intrinsic::arm_neon_vtbx4:
-      return SelectVTBL(N, true, 4, ARM::VTBX4);
+      return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
     }
     break;
   }
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ce4a2c9..1835ec0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
 #include "ARMAddressingModes.h"
+#include "ARMCallingConv.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMISelLowering.h"
 #include "ARMMachineFunctionInfo.h"
@@ -28,9 +29,11 @@
 #include "llvm/Function.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/Type.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -41,6 +44,7 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -50,6 +54,7 @@
 using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 
 // This option should go away when tail calls fully work.
 static cl::opt<bool>
@@ -57,14 +62,7 @@ EnableARMTailCalls("arm-tail-calls", cl::Hidden,
   cl::desc("Generate tail calls (TEMPORARY OPTION)."),
   cl::init(false));
 
-// This option should go away when Machine LICM is smart enough to hoist a 
-// reg-to-reg VDUP.
-static cl::opt<bool>
-EnableARMVDUPsplat("arm-vdup-splat", cl::Hidden,
-  cl::desc("Generate VDUP for integer constant splats (TEMPORARY OPTION)."),
-  cl::init(false));
-
-static cl::opt<bool>
+cl::opt<bool>
 EnableARMLongCalls("arm-long-calls", cl::Hidden,
   cl::desc("Generate calls via indirect call instructions"),
   cl::init(false));
@@ -74,28 +72,6 @@ ARMInterworking("arm-interworking", cl::Hidden,
   cl::desc("Enable / disable ARM interworking (for debugging only)"),
   cl::init(true));
 
-static cl::opt<bool>
-EnableARMCodePlacement("arm-code-placement", cl::Hidden,
-  cl::desc("Enable code placement pass for ARM"),
-  cl::init(false));
-
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags,
-                                   CCState &State);
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                    CCValAssign::LocInfo &LocInfo,
-                                    ISD::ArgFlagsTy &ArgFlags,
-                                    CCState &State);
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                      CCValAssign::LocInfo &LocInfo,
-                                      ISD::ArgFlagsTy &ArgFlags,
-                                      CCState &State);
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                       CCValAssign::LocInfo &LocInfo,
-                                       ISD::ArgFlagsTy &ArgFlags,
-                                       CCState &State);
-
 void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
                                        EVT PromotedBitwiseVT) {
   if (VT != PromotedLdStVT) {
@@ -111,8 +87,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
   EVT ElemTy = VT.getVectorElementType();
   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
     setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
-  if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
   if (ElemTy != MVT::i32) {
     setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
     setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
@@ -122,7 +97,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal);
   setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
   setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
   if (VT.isInteger()) {
@@ -131,6 +106,10 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
     setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
     setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand);
     setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand);
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction(VT.getSimpleVT(),
+                          (MVT::SimpleValueType)InnerVT, Expand);
   }
   setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
 
@@ -177,6 +156,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     : TargetLowering(TM, createTLOF(TM)) {
   Subtarget = &TM.getSubtarget<ARMSubtarget>();
   RegInfo = TM.getRegisterInfo();
+  Itins = TM.getInstrItineraryData();
 
   if (Subtarget->isTargetDarwin()) {
     // Uses VFP for Thumb libfuncs if available.
@@ -260,13 +240,157 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setLibcallName(RTLIB::SRL_I128, 0);
   setLibcallName(RTLIB::SRA_I128, 0);
 
-  // Libcalls should use the AAPCS base standard ABI, even if hard float
-  // is in effect, as per the ARM RTABI specification, section 4.1.2.
   if (Subtarget->isAAPCS_ABI()) {
-    for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
-      setLibcallCallingConv(static_cast<RTLIB::Libcall>(i),
-                            CallingConv::ARM_AAPCS);
-    }
+    // Double-precision floating-point arithmetic helper functions
+    // RTABI chapter 4.1.2, Table 2
+    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
+    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
+    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
+    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
+    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
+
+    // Double-precision floating-point comparison helper functions
+    // RTABI chapter 4.1.2, Table 3
+    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
+    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
+    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
+    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
+    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
+    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
+    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
+    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
+    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
+    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
+
+    // Single-precision floating-point arithmetic helper functions
+    // RTABI chapter 4.1.2, Table 4
+    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
+    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
+    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
+    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
+    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
+
+    // Single-precision floating-point comparison helper functions
+    // RTABI chapter 4.1.2, Table 5
+    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
+    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
+    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
+    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
+    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
+    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
+    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
+    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
+    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
+    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
+
+    // Floating-point to integer conversions.
+    // RTABI chapter 4.1.2, Table 6
+    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
+    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
+    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
+    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
+    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
+
+    // Conversions between floating types.
+    // RTABI chapter 4.1.2, Table 7
+    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
+    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
+
+    // Integer to floating-point conversions.
+    // RTABI chapter 4.1.2, Table 8
+    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
+    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
+    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
+    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
+    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
+    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
+    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
+    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
+    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
+
+    // Long long helper functions
+    // RTABI chapter 4.2, Table 9
+    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
+    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
+    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
+    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
+    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
+    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
+
+    // Integer division functions
+    // RTABI chapter 4.3.1
+    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
+    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
+    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
+    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
+    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
+    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
+    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
   }
 
   if (Subtarget->isThumb1Only())
@@ -330,9 +454,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    // Custom handling for some vector types to avoid expensive expansions
+    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
     setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
 
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRL);
@@ -341,6 +472,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::SELECT_CC);
+    setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
   }
 
   computeRegisterProperties();
@@ -397,7 +532,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
 
   // These are expanded into libcalls.
-  if (!Subtarget->hasDivide()) {
+  if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
     // v7M has a hardware divider
     setOperationAction(ISD::SDIV,  MVT::i32, Expand);
     setOperationAction(ISD::UDIV,  MVT::i32, Expand);
@@ -423,14 +558,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
   setOperationAction(ISD::EHSELECTION,        MVT::i32,   Expand);
-  // FIXME: Shouldn't need this, since no register is used, but the legalizer
-  // doesn't yet know how to not do that for SjLj.
-  setExceptionSelectorRegister(ARM::R0);
+  setOperationAction(ISD::EXCEPTIONADDR,      MVT::i32,   Expand);
+  setExceptionPointerRegister(ARM::R0);
+  setExceptionSelectorRegister(ARM::R1);
+
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
   if (Subtarget->hasDataBarrier() ||
-      (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())) {
+      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
     // membarrier needs custom lowering; the rest are legal and handled
     // normally.
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
@@ -474,6 +610,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Expand);
   setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
 
+  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
+
   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
   if (!Subtarget->hasV6Ops()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -484,7 +622,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
     // iff target supports vfp2.
-    setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
   }
 
@@ -493,6 +631,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (Subtarget->isTargetDarwin()) {
     setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
     setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+    setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom);
   }
 
   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
@@ -547,8 +686,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::MUL);
 
-  if (Subtarget->hasV6T2Ops())
+  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON())
     setTargetDAGCombine(ISD::OR);
+  if (Subtarget->hasNEON())
+    setTargetDAGCombine(ISD::AND);
 
   setStackPointerRegisterToSaveRestore(ARM::SP);
 
@@ -557,16 +698,26 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   else
     setSchedulingPreference(Sched::Hybrid);
 
-  maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
+  //// temporary - rewrite interface to use type
+  maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
 
   // On ARM arguments smaller than 4 bytes are extended, so all arguments
   // are at least 4 bytes aligned.
   setMinStackArgumentAlignment(4);
 
-  if (EnableARMCodePlacement)
-    benefitFromCodePlacementOpt = true;
+  benefitFromCodePlacementOpt = true;
 }
 
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of it's super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurence of cross class copies and subregister insertions
+// and extractions.
 std::pair<const TargetRegisterClass*, uint8_t>
 ARMTargetLowering::findRepresentativeClass(EVT VT) const{
   const TargetRegisterClass *RRC = 0;
@@ -580,6 +731,12 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{
   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
     RRC = ARM::DPRRegisterClass;
+    // When NEON is used for SP, only half of the register file is available
+    // because operations that define both SP and DP results will be constrained
+    // to the VFP2 class (D0-D15). We currently model this constraint prior to
+    // coalescing by double-counting the SP regs. See the FIXME above.
+    if (Subtarget->useNEONForSinglePrecisionFP())
+      Cost = 2;
     break;
   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   case MVT::v4f32: case MVT::v2f64:
@@ -602,6 +759,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
   default: return 0;
   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
+  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
   case ARMISD::CALL:          return "ARMISD::CALL";
   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
@@ -612,7 +771,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
-  case ARMISD::AND:           return "ARMISD::AND";
   case ARMISD::CMP:           return "ARMISD::CMP";
   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
@@ -633,25 +791,33 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
   case ARMISD::RRX:           return "ARMISD::RRX";
 
-  case ARMISD::VMOVRRD:         return "ARMISD::VMOVRRD";
-  case ARMISD::VMOVDRR:         return "ARMISD::VMOVDRR";
+  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
 
   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
   case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
+  case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP";
 
   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
-  
+
   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
 
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
 
   case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
-  case ARMISD::SYNCBARRIER:   return "ARMISD::SYNCBARRIER";
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
 
   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
   case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
+  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
   case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
+  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
   case ARMISD::VTST:          return "ARMISD::VTST";
 
@@ -693,6 +859,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
   case ARMISD::BFI:           return "ARMISD::BFI";
+  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
+  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
+  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
   }
 }
 
@@ -735,6 +923,8 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
 
   for (unsigned i = 0; i != NumVals; ++i) {
     EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue || VT == MVT::Other)
+      continue;
     if (VT.isFloatingPoint() || VT.isVector())
       return Sched::Latency;
   }
@@ -746,25 +936,29 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
   // is not available.
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
-  if (TID.mayLoad())
-    return Sched::Latency;
 
-  const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData();
-  if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2)
+  if (TID.getNumDefs() == 0)
+    return Sched::RegPressure;
+  if (!Itins->isEmpty() &&
+      Itins->getOperandCycle(TID.getSchedClass(), 0) > 2)
     return Sched::Latency;
+
   return Sched::RegPressure;
 }
 
+// FIXME: Move to RegInfo
 unsigned
 ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                        MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
   switch (RC->getID()) {
   default:
     return 0;
   case ARM::tGPRRegClassID:
-    return RegInfo->hasFP(MF) ? 4 : 5;
+    return TFI->hasFP(MF) ? 4 : 5;
   case ARM::GPRRegClassID: {
-    unsigned FP = RegInfo->hasFP(MF) ? 1 : 0;
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
     return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0);
   }
   case ARM::SPRRegClassID:  // Currently not used as 'rep' register class.
@@ -829,136 +1023,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
 
 #include "ARMGenCallingConv.inc"
 
-// APCS f64 is in register pairs, possibly split to stack
-static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                          CCValAssign::LocInfo &LocInfo,
-                          CCState &State, bool CanFail) {
-  static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-
-  // Try to get the first register.
-  if (unsigned Reg = State.AllocateReg(RegList, 4))
-    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  else {
-    // For the 2nd half of a v2f64, do not fail.
-    if (CanFail)
-      return false;
-
-    // Put the whole thing on the stack.
-    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
-                                           State.AllocateStack(8, 4),
-                                           LocVT, LocInfo));
-    return true;
-  }
-
-  // Try to get the second register.
-  if (unsigned Reg = State.AllocateReg(RegList, 4))
-    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  else
-    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
-                                           State.AllocateStack(4, 4),
-                                           LocVT, LocInfo));
-  return true;
-}
-
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags,
-                                   CCState &State) {
-  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
-    return false;
-  if (LocVT == MVT::v2f64 &&
-      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
-    return false;
-  return true;  // we handled it
-}
-
-// AAPCS f64 is in aligned register pairs
-static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                           CCValAssign::LocInfo &LocInfo,
-                           CCState &State, bool CanFail) {
-  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
-  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
-  static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 };
-
-  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
-  if (Reg == 0) {
-    // For the 2nd half of a v2f64, do not just fail.
-    if (CanFail)
-      return false;
-
-    // Put the whole thing on the stack.
-    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
-                                           State.AllocateStack(8, 8),
-                                           LocVT, LocInfo));
-    return true;
-  }
-
-  unsigned i;
-  for (i = 0; i < 2; ++i)
-    if (HiRegList[i] == Reg)
-      break;
-
-  unsigned T = State.AllocateReg(LoRegList[i]);
-  (void)T;
-  assert(T == LoRegList[i] && "Could not allocate register");
-
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
-                                         LocVT, LocInfo));
-  return true;
-}
-
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                    CCValAssign::LocInfo &LocInfo,
-                                    ISD::ArgFlagsTy &ArgFlags,
-                                    CCState &State) {
-  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
-    return false;
-  if (LocVT == MVT::v2f64 &&
-      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
-    return false;
-  return true;  // we handled it
-}
-
-static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                         CCValAssign::LocInfo &LocInfo, CCState &State) {
-  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
-  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
-
-  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
-  if (Reg == 0)
-    return false; // we didn't handle it
-
-  unsigned i;
-  for (i = 0; i < 2; ++i)
-    if (HiRegList[i] == Reg)
-      break;
-
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
-                                         LocVT, LocInfo));
-  return true;
-}
-
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                      CCValAssign::LocInfo &LocInfo,
-                                      ISD::ArgFlagsTy &ArgFlags,
-                                      CCState &State) {
-  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
-    return false;
-  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
-    return false;
-  return true;  // we handled it
-}
-
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                       CCValAssign::LocInfo &LocInfo,
-                                       ISD::ArgFlagsTy &ArgFlags,
-                                       CCState &State) {
-  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
-                                   State);
-}
-
 /// CCAssignFnForNode - Selects the correct CCAssignFn for a the
 /// given CallingConvention value.
 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
@@ -967,23 +1031,29 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
   switch (CC) {
   default:
     llvm_unreachable("Unsupported calling convention");
-  case CallingConv::C:
   case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    // Fallthrough
+  case CallingConv::C: {
     // Use target triple & subtarget features to do actual dispatch.
-    if (Subtarget->isAAPCS_ABI()) {
-      if (Subtarget->hasVFP2() &&
-          FloatABIType == FloatABI::Hard && !isVarArg)
-        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
-      else
-        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
-    } else
-        return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+    if (!Subtarget->isAAPCS_ABI())
+      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+    else if (Subtarget->hasVFP2() &&
+             FloatABIType == FloatABI::Hard && !isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  }
   case CallingConv::ARM_AAPCS_VFP:
-    return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
   case CallingConv::ARM_AAPCS:
-    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
   case CallingConv::ARM_APCS:
-    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
   }
 }
 
@@ -1050,7 +1120,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::BCvt:
-      Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val);
+      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
       break;
     }
 
@@ -1073,7 +1143,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                        /*isVolatile=*/false, /*AlwaysInline=*/false,
-                       NULL, 0, NULL, 0);
+                       MachinePointerInfo(0), MachinePointerInfo(0));
 }
 
 /// LowerMemOpCallTo - Store the argument to the stack.
@@ -1086,11 +1156,11 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
   unsigned LocMemOffset = VA.getLocMemOffset();
   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
-  if (Flags.isByVal()) {
+  if (Flags.isByVal())
     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
-  }
+
   return DAG.getStore(Chain, dl, Arg, PtrOff,
-                      PseudoSourceValue::getStack(), LocMemOffset,
+                      MachinePointerInfo::getStack(LocMemOffset),
                       false, false, 0);
 }
 
@@ -1198,7 +1268,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
       break;
     case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
       break;
     }
 
@@ -1289,7 +1359,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
       const GlobalValue *GV = G->getGlobal();
       // Create a constant pool entry for the callee address
-      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
                                                            ARMPCLabelIndex,
                                                            ARMCP::CPValue, 0);
@@ -1298,13 +1368,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
                            DAG.getEntryNode(), CPAddr,
-                           PseudoSourceValue::getConstantPool(), 0,
+                           MachinePointerInfo::getConstantPool(),
                            false, false, 0);
     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
       const char *Sym = S->getSymbol();
 
       // Create a constant pool entry for the callee address
-      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
                                                        Sym, ARMPCLabelIndex, 0);
       // Get the address of the callee into a register
@@ -1312,7 +1382,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
                            DAG.getEntryNode(), CPAddr,
-                           PseudoSourceValue::getConstantPool(), 0,
+                           MachinePointerInfo::getConstantPool(),
                            false, false, 0);
     }
   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
@@ -1326,7 +1396,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
     // tBX takes a register source operand.
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
-      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
                                                            ARMPCLabelIndex,
                                                            ARMCP::CPValue, 4);
@@ -1334,13 +1404,19 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
                            DAG.getEntryNode(), CPAddr,
-                           PseudoSourceValue::getConstantPool(), 0,
+                           MachinePointerInfo::getConstantPool(),
                            false, false, 0);
       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
       Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                            getPointerTy(), Callee, PICLabel);
-    } else
-      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+    } else {
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      unsigned OpFlags = 0;
+      if (Subtarget->isTargetELF() &&
+                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+    }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     isDirect = true;
     bool isStub = Subtarget->isTargetDarwin() &&
@@ -1349,20 +1425,26 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // tBX takes a register source operand.
     const char *Sym = S->getSymbol();
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
-      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
                                                        Sym, ARMPCLabelIndex, 4);
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(getPointerTy(), dl,
                            DAG.getEntryNode(), CPAddr,
-                           PseudoSourceValue::getConstantPool(), 0,
+                           MachinePointerInfo::getConstantPool(),
                            false, false, 0);
       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
       Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                            getPointerTy(), Callee, PICLabel);
-    } else
-      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+    } else {
+      unsigned OpFlags = 0;
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      if (Subtarget->isTargetELF() &&
+                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
+    }
   }
 
   // FIXME: handle tail calls differently.
@@ -1391,7 +1473,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   if (isTailCall)
     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
 
@@ -1421,7 +1503,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   int FI = INT_MAX;
   if (Arg.getOpcode() == ISD::CopyFromReg) {
     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
-    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
       return false;
     MachineInstr *Def = MRI->getVRegDef(VR);
     if (!Def)
@@ -1490,32 +1572,15 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // LR.  This means if we need to reload LR, it takes an extra instructions,
   // which outweighs the value of the tail call; but here we don't know yet
   // whether LR is going to be used.  Probably the right approach is to
-  // generate the tail call here and turn it back into CALL/RET in 
+  // generate the tail call here and turn it back into CALL/RET in
   // emitEpilogue if LR is used.
-  if (Subtarget->isThumb1Only())
-    return false;
-
-  // For the moment, we can only do this to functions defined in this
-  // compilation, or to indirect calls.  A Thumb B to an ARM function,
-  // or vice versa, is not easily fixed up in the linker unlike BL.
-  // (We could do this by loading the address of the callee into a register;
-  // that is an extra instruction over the direct call and burns a register
-  // as well, so is not likely to be a win.)
-
-  // It might be safe to remove this restriction on non-Darwin.
 
   // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
   // but we need to make sure there are enough registers; the only valid
   // registers are the 4 used for parameters.  We don't currently do this
   // case.
-  if (isa<ExternalSymbolSDNode>(Callee))
-      return false;
-
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    const GlobalValue *GV = G->getGlobal();
-    if (GV->isDeclaration() || GV->isWeakForLinker())
-      return false;
-  }
+  if (Subtarget->isThumb1Only())
+    return false;
 
   // If the calling conventions do not match, then we'd better make sure the
   // results are returned in the same way as what the caller expects.
@@ -1583,7 +1648,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
           if (!VA.isRegLoc())
             return false;
           if (!ArgLocs[++i].isRegLoc())
-            return false; 
+            return false;
           if (RegVT == MVT::v2f64) {
             if (!ArgLocs[++i].isRegLoc())
               return false;
@@ -1643,7 +1708,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
       break;
     }
 
@@ -1693,6 +1758,61 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   return result;
 }
 
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  unsigned NumCopies = 0;
+  SDNode* Copies[2];
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg) {
+    Copies[NumCopies++] = Use;
+  } else if (Use->getOpcode() == ARMISD::VMOVRRD) {
+    // f64 returned in a pair of GPRs.
+    for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end();
+         UI != UE; ++UI) {
+      if (UI->getOpcode() != ISD::CopyToReg)
+        return false;
+      Copies[UI.getUse().getResNo()] = *UI;
+      ++NumCopies;
+    }
+  } else if (Use->getOpcode() == ISD::BITCAST) {
+    // f32 returned in a single GPR.
+    if (!Use->hasNUsesOfValue(1, 0))
+      return false;
+    Use = *Use->use_begin();
+    if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0))
+      return false;
+    Copies[NumCopies++] = Use;
+  } else {
+    return false;
+  }
+
+  if (NumCopies != 1 && NumCopies != 2)
+    return false;
+
+  bool HasRet = false;
+  for (unsigned i = 0; i < NumCopies; ++i) {
+    SDNode *Copy = Copies[i];
+    for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+         UI != UE; ++UI) {
+      if (UI->getOpcode() == ISD::CopyToReg) {
+        SDNode *Use = *UI;
+        if (Use == Copies[0] || Use == Copies[1])
+          continue;
+        return false;
+      }
+      if (UI->getOpcode() != ARMISD::RET_FLAG)
+        return false;
+      HasRet = true;
+    }
+  }
+
+  return HasRet;
+}
+
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
 // one of the above mentioned nodes. It has to be wrapped because otherwise
@@ -1732,7 +1852,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
   } else {
     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
-    ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    ARMPCLabelIndex = AFI->createPICLabelUId();
     ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
                                                          ARMCP::CPBlockAddress,
                                                          PCAdj);
@@ -1740,7 +1860,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
   }
   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
   SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
-                               PseudoSourceValue::getConstantPool(), 0,
+                               MachinePointerInfo::getConstantPool(),
                                false, false, 0);
   if (RelocM == Reloc::Static)
     return Result;
@@ -1757,14 +1877,14 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
   ARMConstantPoolValue *CPV =
     new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                             ARMCP::CPValue, PCAdj, "tlsgd", true);
+                             ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
   Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
-                         PseudoSourceValue::getConstantPool(), 0,
+                         MachinePointerInfo::getConstantPool(),
                          false, false, 0);
   SDValue Chain = Argument.getValue(1);
 
@@ -1802,16 +1922,16 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
   if (GV->isDeclaration()) {
     MachineFunction &MF = DAG.getMachineFunction();
     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-    unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
     // Initial exec model.
     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
     ARMConstantPoolValue *CPV =
       new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                               ARMCP::CPValue, PCAdj, "gottpoff", true);
+                               ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
-                         PseudoSourceValue::getConstantPool(), 0,
+                         MachinePointerInfo::getConstantPool(),
                          false, false, 0);
     Chain = Offset.getValue(1);
 
@@ -1819,15 +1939,15 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
 
     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
-                         PseudoSourceValue::getConstantPool(), 0,
+                         MachinePointerInfo::getConstantPool(),
                          false, false, 0);
   } else {
     // local exec model
-    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff");
+    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF);
     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
     Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
-                         PseudoSourceValue::getConstantPool(), 0,
+                         MachinePointerInfo::getConstantPool(),
                          false, false, 0);
   }
 
@@ -1859,51 +1979,72 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
   if (RelocM == Reloc::PIC_) {
     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
     ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT");
+      new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                                  CPAddr,
-                                 PseudoSourceValue::getConstantPool(), 0,
+                                 MachinePointerInfo::getConstantPool(),
                                  false, false, 0);
     SDValue Chain = Result.getValue(1);
     SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
     if (!UseGOTOFF)
       Result = DAG.getLoad(PtrVT, dl, Chain, Result,
-                           PseudoSourceValue::getGOT(), 0,
-                           false, false, 0);
+                           MachinePointerInfo::getGOT(), false, false, 0);
     return Result;
+  }
+
+  // If we have T2 ops, we can materialize the address directly via movt/movw
+  // pair. This is always cheaper.
+  if (Subtarget->useMovt()) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
   } else {
-    // If we have T2 ops, we can materialize the address directly via movt/movw
-    // pair. This is always cheaper.
-    if (Subtarget->useMovt()) {
-      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
-                         DAG.getTargetGlobalAddress(GV, dl, PtrVT));
-    } else {
-      SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
-      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
-      return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                         PseudoSourceValue::getConstantPool(), 0,
-                         false, false, 0);
-    }
+    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, 0);
   }
 }
 
 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                     SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned ARMPCLabelIndex = 0;
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  if (Subtarget->useMovt()) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    if (RelocM == Reloc::Static)
+      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+
+    unsigned Wrapper = (RelocM == Reloc::PIC_)
+      ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
+    SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
+                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+    if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+      Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+                           MachinePointerInfo::getGOT(), false, false, 0);
+    return Result;
+  }
+
+  unsigned ARMPCLabelIndex = 0;
   SDValue CPAddr;
-  if (RelocM == Reloc::Static)
+  if (RelocM == Reloc::Static) {
     CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
-  else {
-    ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+  } else {
+    ARMPCLabelIndex = AFI->createPICLabelUId();
     unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
     ARMConstantPoolValue *CPV =
       new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
@@ -1912,7 +2053,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
 
   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                               PseudoSourceValue::getConstantPool(), 0,
+                               MachinePointerInfo::getConstantPool(),
                                false, false, 0);
   SDValue Chain = Result.getValue(1);
 
@@ -1922,8 +2063,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   }
 
   if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
-    Result = DAG.getLoad(PtrVT, dl, Chain, Result,
-                         PseudoSourceValue::getGOT(), 0,
+    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
                          false, false, 0);
 
   return Result;
@@ -1935,7 +2075,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
          "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
@@ -1945,13 +2085,21 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                               PseudoSourceValue::getConstantPool(), 0,
+                               MachinePointerInfo::getConstantPool(),
                                false, false, 0);
   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
   return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
 }
 
 SDValue
+ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
+  const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue
 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   SDValue Val = DAG.getConstant(0, MVT::i32);
@@ -1980,7 +2128,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-    unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
     EVT PtrVT = getPointerTy();
     DebugLoc dl = Op.getDebugLoc();
     Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -1994,7 +2142,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue Result =
       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                  PseudoSourceValue::getConstantPool(), 0,
+                  MachinePointerInfo::getConstantPool(),
                   false, false, 0);
 
     if (RelocM == Reloc::PIC_) {
@@ -2009,21 +2157,55 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
 static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
                                const ARMSubtarget *Subtarget) {
   DebugLoc dl = Op.getDebugLoc();
-  SDValue Op5 = Op.getOperand(5);
-  unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue();
-  // Some subtargets which have dmb and dsb instructions can handle barriers
-  // directly. Some ARMv6 cpus can support them with the help of mcr
-  // instruction. Thumb1 and pre-v6 ARM mode use a libcall instead and should
-  // never get here.
-  unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER;
-  if (Subtarget->hasDataBarrier())
-    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0));
-  else {
-    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb1Only() &&
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
            "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
-    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0),
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                        DAG.getConstant(0, MVT::i32));
   }
+
+  SDValue Op5 = Op.getOperand(5);
+  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
+  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
+
+  ARM_MB::MemBOpt DMBOpt;
+  if (isDeviceBarrier)
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
+  else
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(DMBOpt, MVT::i32));
+}
+
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  if (Subtarget->isThumb())
+    // Invert the bits.
+    isRead = ~isRead & 1;
+  unsigned isData = Subtarget->isThumb() ? 0 : 1;
+
+  // Currently there is no intrinsic that matches pli.
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
+                     DAG.getConstant(isData, MVT::i32));
 }
 
 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
@@ -2036,8 +2218,8 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
-                      false, false, 0);
+  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
 }
 
 SDValue
@@ -2054,7 +2236,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
     RC = ARM::GPRRegisterClass;
 
   // Transform the arguments stored in physical registers into virtual ones.
-  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
 
   SDValue ArgValue2;
@@ -2065,10 +2247,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
     // Create load node to retrieve arguments from the stack.
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
     ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
-                            PseudoSourceValue::getFixedStack(FI), 0,
+                            MachinePointerInfo::getFixedStack(FI),
                             false, false, 0);
   } else {
-    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC, dl);
     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
   }
 
@@ -2119,7 +2301,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
             int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
             SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
-                                    PseudoSourceValue::getFixedStack(FI), 0,
+                                    MachinePointerInfo::getFixedStack(FI),
                                     false, false, 0);
           } else {
             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
@@ -2149,7 +2331,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
         // Transform the arguments in physical registers into virtual ones.
-        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl);
         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
       }
 
@@ -2160,7 +2342,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       default: llvm_unreachable("Unknown loc info!");
       case CCValAssign::Full: break;
       case CCValAssign::BCvt:
-        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
         break;
       case CCValAssign::SExt:
         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
@@ -2188,7 +2370,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       // Create load nodes to retrieve arguments from the stack.
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
       InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
-                                   PseudoSourceValue::getFixedStack(FI), 0,
+                                   MachinePointerInfo::getFixedStack(FI),
                                    false, false, 0));
     }
   }
@@ -2202,7 +2384,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
     unsigned NumGPRs = CCInfo.getFirstUnallocated
       (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
 
-    unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+    unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
     unsigned VARegSize = (4 - NumGPRs) * 4;
     unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
     unsigned ArgOffset = CCInfo.getNextStackOffset();
@@ -2214,7 +2396,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       AFI->setVarArgsFrameIndex(
         MFI->CreateFixedObject(VARegSaveSize,
                                ArgOffset + VARegSaveSize - VARegSize,
-                               true));
+                               false));
       SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
                                       getPointerTy());
 
@@ -2226,12 +2408,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         else
           RC = ARM::GPRRegisterClass;
 
-        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
+        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC, dl);
         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
         SDValue Store =
           DAG.getStore(Val.getValue(1), dl, Val, FIN,
-               PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()),
-               0, false, false, 0);
+               MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+                       false, false, 0);
         MemOps.push_back(Store);
         FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                           DAG.getConstant(4, getPointerTy()));
@@ -2320,7 +2502,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     break;
   }
   ARMcc = DAG.getConstant(CondCode, MVT::i32);
-  return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
+  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
 }
 
 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
@@ -2329,10 +2511,10 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                              DebugLoc dl) const {
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
-    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
   else
-    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Flag, LHS);
-  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp);
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
 }
 
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -2444,8 +2626,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
 
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
     return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
-                       Ld->getChain(), Ld->getBasePtr(),
-                       Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
                        Ld->isVolatile(), Ld->isNonTemporal(),
                        Ld->getAlignment());
 
@@ -2464,7 +2645,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
     SDValue Ptr = Ld->getBasePtr();
     RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
                           Ld->getChain(), Ptr,
-                          Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                          Ld->getPointerInfo(),
                           Ld->isVolatile(), Ld->isNonTemporal(),
                           Ld->getAlignment());
 
@@ -2474,7 +2655,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                                  PtrType, Ptr, DAG.getConstant(4, PtrType));
     RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
                           Ld->getChain(), NewPtr,
-                          Ld->getSrcValue(), Ld->getSrcValueOffset() + 4,
+                          Ld->getPointerInfo().getWithOffset(4),
                           Ld->isVolatile(), Ld->isNonTemporal(),
                           NewAlign);
     return;
@@ -2524,7 +2705,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
     expandf64Toi32(RHS, DAG, RHS1, RHS2);
     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
     ARMcc = DAG.getConstant(CondCode, MVT::i32);
-    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
   }
@@ -2564,7 +2745,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
   if (CondCode2 != ARMCC::AL) {
@@ -2599,14 +2780,14 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   }
   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
     Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
-                       PseudoSourceValue::getJumpTable(), 0,
+                       MachinePointerInfo::getJumpTable(),
                        false, false, 0);
     Chain = Addr.getValue(1);
     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
   } else {
     Addr = DAG.getLoad(PTy, dl, Chain, Addr,
-                       PseudoSourceValue::getJumpTable(), 0, false, false, 0);
+                       MachinePointerInfo::getJumpTable(), false, false, 0);
     Chain = Addr.getValue(1);
     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
   }
@@ -2627,7 +2808,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
     break;
   }
   Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
-  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
 }
 
 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2646,7 +2827,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
     break;
   }
 
-  Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0));
+  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
   return DAG.getNode(Opc, dl, VT, Op);
 }
 
@@ -2657,12 +2838,46 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
-  SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-  SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32);
-  SDValue FP0 = DAG.getConstantFP(0.0, SrcVT);
-  SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl);
+  bool F2IisFast = Subtarget->isCortexA9() ||
+    Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR;
+
+  // Bitcast operand 1 to i32.
+  if (SrcVT == MVT::f64)
+    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       &Tmp1, 1).getValue(1);
+  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+  // If float to int conversion isn't going to be super expensive, then simply
+  // or in the signbit.
+  if (F2IisFast) {
+    SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+    SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+    Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+    if (VT == MVT::f32) {
+      Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+      return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                         DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+    }
+
+    // f64: Or the high part with signbit and then combine two parts.
+    Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       &Tmp0, 1);
+    SDValue Lo = Tmp0.getValue(0);
+    SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+    Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+  }
+
+  // Remove the signbit of operand 0.
+  Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
+
+  // If operand 1 signbit is one, then negate operand 0.
+  SDValue ARMcc;
+  SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32),
+                          ISD::SETLT, ARMcc, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp);
+  return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp);
 }
 
 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
@@ -2678,11 +2893,11 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
     SDValue Offset = DAG.getConstant(4, MVT::i32);
     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
-                       NULL, 0, false, false, 0);
+                       MachinePointerInfo(), false, false, 0);
   }
 
   // Return LR, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32), dl);
   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
 }
 
@@ -2697,17 +2912,18 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
     ? ARM::R7 : ARM::R11;
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   while (Depth--)
-    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
                             false, false, 0);
   return FrameAddr;
 }
 
-/// ExpandBIT_CONVERT - If the target supports VFP, this function is called to
+/// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
 /// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   DebugLoc dl = N->getDebugLoc();
   SDValue Op = N->getOperand(0);
@@ -2717,7 +2933,7 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
-         "ExpandBIT_CONVERT called for non-i64 type");
+         "ExpandBITCAST called for non-i64 type");
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -2725,7 +2941,7 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
                              DAG.getConstant(0, MVT::i32));
     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                              DAG.getConstant(1, MVT::i32));
-    return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT,
+    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
   }
 
@@ -2752,7 +2968,7 @@ static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
   SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
 }
 
 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
@@ -2825,7 +3041,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
-SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 
+SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                             SelectionDAG &DAG) const {
   // The rounding mode is in bits 23:22 of the FPSCR.
   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
@@ -2835,11 +3051,11 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
                               DAG.getConstant(Intrinsic::arm_get_fpscr,
                                               MVT::i32));
-  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 
+  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                   DAG.getConstant(1U << 22, MVT::i32));
   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                               DAG.getConstant(22, MVT::i32));
-  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 
+  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                      DAG.getConstant(3, MVT::i32));
 }
 
@@ -2860,33 +3076,40 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   DebugLoc dl = N->getDebugLoc();
 
+  if (!VT.isVector())
+    return SDValue();
+
   // Lower vector shifts on NEON to use VSHL.
-  if (VT.isVector()) {
-    assert(ST->hasNEON() && "unexpected vector shift");
-
-    // Left shifts translate directly to the vshiftu intrinsic.
-    if (N->getOpcode() == ISD::SHL)
-      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
-                         N->getOperand(0), N->getOperand(1));
-
-    assert((N->getOpcode() == ISD::SRA ||
-            N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
-
-    // NEON uses the same intrinsics for both left and right shifts.  For
-    // right shifts, the shift amounts are negative, so negate the vector of
-    // shift amounts.
-    EVT ShiftVT = N->getOperand(1).getValueType();
-    SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
-                                       getZeroVector(ShiftVT, DAG, dl),
-                                       N->getOperand(1));
-    Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
-                               Intrinsic::arm_neon_vshifts :
-                               Intrinsic::arm_neon_vshiftu);
+  assert(ST->hasNEON() && "unexpected vector shift");
+
+  // Left shifts translate directly to the vshiftu intrinsic.
+  if (N->getOpcode() == ISD::SHL)
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(vshiftInt, MVT::i32),
-                       N->getOperand(0), NegatedCount);
-  }
+                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+                       N->getOperand(0), N->getOperand(1));
+
+  assert((N->getOpcode() == ISD::SRA ||
+          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+  // NEON uses the same intrinsics for both left and right shifts.  For
+  // right shifts, the shift amounts are negative, so negate the vector of
+  // shift amounts.
+  EVT ShiftVT = N->getOperand(1).getValueType();
+  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+                                     getZeroVector(ShiftVT, DAG, dl),
+                                     N->getOperand(1));
+  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+                             Intrinsic::arm_neon_vshifts :
+                             Intrinsic::arm_neon_vshiftu);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(vshiftInt, MVT::i32),
+                     N->getOperand(0), NegatedCount);
+}
+
+static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
+                                const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
 
   // We can get here for a node like i32 = ISD::SHL i32, i64
   if (VT != MVT::i64)
@@ -2912,7 +3135,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
   // captures the result into a carry flag.
   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
-  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1);
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
 
   // The low part is an ARMISD::RRX operand, which shifts the carry in.
   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
@@ -2998,13 +3221,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
         AndOp = Op1;
 
       // Ignore bitconvert.
-      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT)
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
         AndOp = AndOp.getOperand(0);
 
       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
         Opc = ARMISD::VTST;
-        Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0));
-        Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1));
+        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
         Invert = !Invert;
       }
     }
@@ -3013,7 +3236,38 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   if (Swap)
     std::swap(Op0, Op1);
 
-  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  // If one of the operands is a constant vector zero, attempt to fold the
+  // comparison to a specialized compare-against-zero form.
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+    }
+  } else {
+     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  }
 
   if (Invert)
     Result = DAG.getNOT(dl, Result, VT);
@@ -3026,7 +3280,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
 /// operand (e.g., VMOV).  If so, return the encoded value.
 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                  unsigned SplatBitSize, SelectionDAG &DAG,
-                                 EVT &VT, bool is128Bits, bool isVMOV) {
+                                 EVT &VT, bool is128Bits, NEONModImmType type) {
   unsigned OpCmode, Imm;
 
   // SplatBitSize is set to the smallest size that splats the vector, so a
@@ -3039,7 +3293,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
 
   switch (SplatBitSize) {
   case 8:
-    if (!isVMOV)
+    if (type != VMOVModImm)
       return SDValue();
     // Any 1-byte value is OK.  Op=0, Cmode=1110.
     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
@@ -3096,6 +3350,9 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
       break;
     }
 
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
     if ((SplatBits & ~0xffff) == 0 &&
         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
       // Value = 0x0000nnff: Op=x, Cmode=1100.
@@ -3122,7 +3379,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
     return SDValue();
 
   case 64: {
-    if (!isVMOV)
+    if (type != VMOVModImm)
       return SDValue();
     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
     uint64_t BitMask = 0xff;
@@ -3376,8 +3633,8 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
 
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 
-                                 const ARMSubtarget *ST) {
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                             const ARMSubtarget *ST) const {
   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
@@ -3391,10 +3648,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       EVT VmovVT;
       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                       SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(), true);
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMOVModImm);
       if (Val.getNode()) {
         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
-        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
       }
 
       // Try an immediate VMVN.
@@ -3402,10 +3660,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                              ((1LL << SplatBitSize) - 1));
       Val = isNEONModifiedImm(NegatedImm,
                                       SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(), false);
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMVNModImm);
       if (Val.getNode()) {
         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
-        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
       }
     }
   }
@@ -3439,26 +3698,25 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
 
-  if (EnableARMVDUPsplat) {
-    // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
-    // i32 and try again.
-    if (usesOnlyOneValue && EltSize <= 32) {
-      if (!isConstant)
-        return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
-      if (VT.getVectorElementType().isFloatingPoint()) {
-        SmallVector<SDValue, 8> Ops;
-        for (unsigned i = 0; i < NumElts; ++i)
-          Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 
-                                    Op.getOperand(i)));
-        SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &Ops[0],
-                                  NumElts);
-        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 
-                           LowerBUILD_VECTOR(Val, DAG, ST));
-      }
-      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (usesOnlyOneValue && EltSize <= 32) {
+    if (!isConstant)
+      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+                                  Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
+      Val = LowerBUILD_VECTOR(Val, DAG, ST);
       if (Val.getNode())
-        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
     }
+    SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+    if (Val.getNode())
+      return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
   }
 
   // If all elements are constants and the case above didn't get hit, fall back
@@ -3467,10 +3725,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (isConstant)
     return SDValue();
 
-  if (!EnableARMVDUPsplat) {
-    // Use VDUP for non-constant splats.
-    if (usesOnlyOneValue && EltSize <= 32)
-      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
   }
 
   // Vectors with 32- or 64-bit elements can be built by directly assigning
@@ -3483,14 +3742,144 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0; i < NumElts; ++i)
-      Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i)));
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
   return SDValue();
 }
 
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    }
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = {0, 0};
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    if (SourceVecs[i].getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = SourceVecs[i];
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+      // It probably isn't worth padding out a smaller vector just to
+      // break it down again in a shuffle.
+      return SDValue();
+    }
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
+           "unexpected vector sizes in ReconstructShuffle");
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
+                                          .getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+    }
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
 /// isShuffleMaskLegal - Targets can use this to indicate that they only
 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -3706,8 +4095,8 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
     // registers are defined to use, and since i64 is not legal.
     EVT EltVT = EVT::getFloatingPointVT(EltSize);
     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
-    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1);
-    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0; i < NumElts; ++i) {
       if (ShuffleMask[i] < 0)
@@ -3719,21 +4108,26 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
                                                   MVT::i32)));
     }
     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
   return SDValue();
 }
 
 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue Vec = Op.getOperand(0);
+  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
   SDValue Lane = Op.getOperand(1);
-  assert(VT == MVT::i32 &&
-         Vec.getValueType().getVectorElementType().getSizeInBits() < 32 &&
-         "unexpected type for custom-lowering vector extract");
-  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  if (Op.getValueType() == MVT::i32 &&
+      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  }
+
+  return Op;
 }
 
 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
@@ -3747,25 +4141,123 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   SDValue Op1 = Op.getOperand(1);
   if (Op0.getOpcode() != ISD::UNDEF)
     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
-                      DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0),
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                       DAG.getIntPtrConstant(0));
   if (Op1.getOpcode() != ISD::UNDEF)
     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
-                      DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1),
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                       DAG.getIntPtrConstant(1));
-  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
+  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
+}
+
+/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
+/// element has been zero/sign-extended, depending on the isSigned parameter,
+/// from an integer type half its size.
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    if (BVN->getValueType(0) != MVT::v4i32 ||
+        BVN->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    unsigned HiElt = 1 - LoElt;
+    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
+    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
+    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
+    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
+    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
+      return false;
+    if (isSigned) {
+      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
+          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
+        return true;
+    } else {
+      if (Hi0->isNullValue() && Hi1->isNullValue())
+        return true;
+    }
+    return false;
+  }
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        int64_t SExtVal = C->getSExtValue();
+        if ((SExtVal >> HalfSize) != (SExtVal >> EltSize))
+          return false;
+      } else {
+        if ((C->getZExtValue() >> HalfSize) != 0)
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/// isSignExtended - Check if a node is a vector value that is sign-extended
+/// or a constant BUILD_VECTOR with sign-extended elements.
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+/// isZeroExtended - Check if a node is a vector value that is zero-extended
+/// or a constant BUILD_VECTOR with zero-extended elements.
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
 }
 
-/// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or
-/// an extending load, return the unextended value.
+/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
+/// load, or BUILD_VECTOR with extended elements, return the unextended value.
 static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
     return N->getOperand(0);
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-  return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
-                     LD->getBasePtr(), LD->getSrcValue(),
-                     LD->getSrcValueOffset(), LD->isVolatile(),
-                     LD->isNonTemporal(), LD->getAlignment());
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
+                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                       LD->isNonTemporal(), LD->getAlignment());
+  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
+  // have been legalized as a BITCAST from v4i32.
+  if (N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
+           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
+                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+  }
+  // Construct a new BUILD_VECTOR with elements truncated to half the size.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
 }
 
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
@@ -3776,19 +4268,16 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   SDNode *N0 = Op.getOperand(0).getNode();
   SDNode *N1 = Op.getOperand(1).getNode();
   unsigned NewOpc = 0;
-  if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) &&
-      (N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) {
+  if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG))
     NewOpc = ARMISD::VMULLs;
-  } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) &&
-             (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) {
+  else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG))
     NewOpc = ARMISD::VMULLu;
-  } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) {
+  else if (VT == MVT::v2i64)
     // Fall through to expand this.  It is not legal.
     return SDValue();
-  } else {
+  else
     // Other vector multiplications are legal.
     return Op;
-  }
 
   // Legalize to a VMULL instruction.
   DebugLoc DL = Op.getDebugLoc();
@@ -3801,6 +4290,181 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
 }
 
+static SDValue 
+LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+  // Convert to float
+  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+  // Get reciprocal estimate.
+  // float4 recip = vrecpeq_f32(yf);
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+  // Because char has a smaller range than uchar, we can actually get away
+  // without any newton steps.  This requires that we use a weird bias
+  // of 0xb000, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+  Y = DAG.getConstant(0xb000, MVT::i32);
+  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+  // Convert back to short.
+  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+  return X;
+}
+
+static SDValue 
+LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+  SDValue N2;
+  // Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+  
+  // Use reciprocal estimate and one refinement step.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Because short has a smaller range than ushort, we can actually get away
+  // with only a single newton step.  This requires that we use a weird bias
+  // of 89, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(89, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+  
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+    
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4)); 
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+    
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+  
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+    
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4)); 
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+    
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+    
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+    
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+                     N0);
+    return N0;
+  }
+  
+  // v4i16 sdiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Simply multiplying by the reciprocal estimate can leave us a few ulps
+  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+  // and that it will never cause us to return an answer too large).
+  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(2, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_u32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -3816,6 +4480,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
   case ISD::VASTART:       return LowerVASTART(Op, DAG);
   case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
+  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:
@@ -3826,9 +4491,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                Subtarget);
-  case ISD::BIT_CONVERT:   return ExpandBIT_CONVERT(Op.getNode(), DAG);
+  case ISD::BITCAST:   return ExpandBITCAST(Op.getNode(), DAG);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -3843,6 +4509,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:           return LowerMUL(Op, DAG);
+  case ISD::SDIV:          return LowerSDIV(Op, DAG);
+  case ISD::UDIV:          return LowerUDIV(Op, DAG);
   }
   return SDValue();
 }
@@ -3857,12 +4525,12 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   default:
     llvm_unreachable("Don't know how to custom expand this!");
     break;
-  case ISD::BIT_CONVERT:
-    Res = ExpandBIT_CONVERT(N, DAG);
+  case ISD::BITCAST:
+    Res = ExpandBITCAST(N, DAG);
     break;
   case ISD::SRL:
   case ISD::SRA:
-    Res = LowerShift(N, DAG, Subtarget);
+    Res = Expand64BitShift(N, DAG, Subtarget);
     break;
   }
   if (Res.getNode())
@@ -3892,7 +4560,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
   case 1:
     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
     break;
   case 2:
     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
@@ -4183,6 +4851,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 
   case ARM::BCCi64:
   case ARM::BCCZi64: {
+    // If there is an unconditional branch to the other successor, remove it.
+    BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+
     // Compare both parts that make up the double comparison separately for
     // equality.
     bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
@@ -4341,10 +5012,6 @@ static SDValue PerformMULCombine(SDNode *N,
   if (Subtarget->isThumb1Only())
     return SDValue();
 
-  if (DAG.getMachineFunction().
-      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
-    return SDValue();
-
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
@@ -4389,10 +5056,67 @@ static SDValue PerformMULCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue PerformANDCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // Attempt to use immediate-form VBIC
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VbicVT;
+      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VbicVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
   // reasonable.
 
@@ -4400,7 +5124,6 @@ static SDValue PerformORCombine(SDNode *N,
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
-  SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   DebugLoc DL = N->getDebugLoc();
   // 1) or (and A, mask), val => ARMbfi A, val, mask
@@ -4415,40 +5138,46 @@ static SDValue PerformORCombine(SDNode *N,
   if (N0.getOpcode() != ISD::AND)
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT != MVT::i32)
     return SDValue();
 
+  SDValue N00 = N0.getOperand(0);
 
   // The value and the mask need to be constants so we can verify this is
   // actually a bitfield set. If the mask is 0xffff, we can do better
   // via a movt instruction, so don't use BFI in that case.
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-  if (!C)
+  SDValue MaskOp = N0.getOperand(1);
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!MaskC)
     return SDValue();
-  unsigned Mask = C->getZExtValue();
+  unsigned Mask = MaskC->getZExtValue();
   if (Mask == 0xffff)
     return SDValue();
   SDValue Res;
   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
-  if ((C = dyn_cast<ConstantSDNode>(N1))) {
-    unsigned Val = C->getZExtValue();
-    if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N1C) {
+    unsigned Val = N1C->getZExtValue();
+    if ((Val & ~Mask) != Val)
       return SDValue();
-    Val >>= CountTrailingZeros_32(~Mask);
 
-    Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
-                      DAG.getConstant(Val, MVT::i32),
-                      DAG.getConstant(Mask, MVT::i32));
+    if (ARM::isBitFieldInvertedMask(Mask)) {
+      Val >>= CountTrailingZeros_32(~Mask);
 
-    // Do not add new nodes to DAG combiner worklist.
-    DCI.CombineTo(N, Res, false);
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
+                        DAG.getConstant(Val, MVT::i32),
+                        DAG.getConstant(Mask, MVT::i32));
+
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
   } else if (N1.getOpcode() == ISD::AND) {
     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
-    C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
-    if (!C)
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
       return SDValue();
-    unsigned Mask2 = C->getZExtValue();
+    unsigned Mask2 = N11C->getZExtValue();
 
     if (ARM::isBitFieldInvertedMask(Mask) &&
         ARM::isBitFieldInvertedMask(~Mask2) &&
@@ -4462,10 +5191,11 @@ static SDValue PerformORCombine(SDNode *N,
       unsigned lsb = CountTrailingZeros_32(Mask2);
       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                         DAG.getConstant(lsb, MVT::i32));
-      Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res,
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                         DAG.getConstant(Mask, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
+      return SDValue();
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
                ARM::isBitFieldInvertedMask(Mask2) &&
                (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
@@ -4476,40 +5206,472 @@ static SDValue PerformORCombine(SDNode *N,
         return SDValue();
       // 2b
       unsigned lsb = CountTrailingZeros_32(Mask);
-      Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                         DAG.getConstant(lsb, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                                 DAG.getConstant(Mask2, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
+      return SDValue();
     }
   }
 
+  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
+      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+      ARM::isBitFieldInvertedMask(~Mask)) {
+    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+    // where lsb(mask) == #shamt and masked bits of B are known zero.
+    SDValue ShAmt = N00.getOperand(1);
+    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    unsigned LSB = CountTrailingZeros_32(Mask);
+    if (ShAmtC != LSB)
+      return SDValue();
+
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
+                      DAG.getConstant(~Mask, MVT::i32));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  }
+
+  return SDValue();
+}
+
+/// PerformBFICombine - (bfi A, (and B, C1), C2) -> (bfi A, B, C2) iff
+/// C1 & C2 == C1.
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() == ISD::AND) {
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned Mask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned Mask2 = N11C->getZExtValue();
+    if ((Mask & Mask2) == Mask2)
+      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
+                             N->getOperand(0), N1.getOperand(0),
+                             N->getOperand(2));
+  }
   return SDValue();
 }
 
 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
 /// ARMISD::VMOVRRD.
 static SDValue PerformVMOVRRDCombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  // fmrrd(fmdrr x, y) -> x,y
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  // vmovrrd(vmovdrr x, y) -> x,y
   SDValue InDouble = N->getOperand(0);
   if (InDouble.getOpcode() == ARMISD::VMOVDRR)
     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
   return SDValue();
 }
 
+/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
+static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
+  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::BITCAST)
+    Op1 = Op1.getOperand(0);
+  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
+      Op0.getNode() == Op1.getNode() &&
+      Op0.getResNo() == 0 && Op1.getResNo() == 1)
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                       N->getValueType(0), Op0.getOperand(0));
+  return SDValue();
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  // Bitcast an i64 store extracted from a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  SDValue StVal = St->getValue();
+  if (!ISD::isNormalStore(St) || St->isVolatile() ||
+      StVal.getValueType() != MVT::i64 ||
+      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = StVal.getDebugLoc();
+  SDValue IntVec = StVal.getOperand(0);
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 IntVec.getValueType().getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                               Vec, StVal.getOperand(1));
+  dl = N->getDebugLoc();
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(ExtElt.getNode());
+  DCI.AddToWorklist(V.getNode());
+  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                      St->getPointerInfo(), St->isVolatile(),
+                      St->isNonTemporal(), St->getAlignment(),
+                      St->getTBAAInfo());
+}
+
+/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
+/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
+/// i64 vector to have f64 elements, since the value can then be loaded
+/// directly into a VFP register.
+static bool hasNormalLoadOperand(SDNode *N) {
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
+      return true;
+  }
+  return false;
+}
+
+/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
+/// ISD::BUILD_VECTOR.
+static SDValue PerformBUILD_VECTORCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI){
+  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
+  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
+  // into a pair of GPRs, which is fine when the value is used as a scalar,
+  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
+  SelectionDAG &DAG = DCI.DAG;
+  if (N->getNumOperands() == 2) {
+    SDValue RV = PerformVMOVDRRCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  // Load i64 elements as f64 values so that type legalization does not split
+  // them up into i32 values.
+  EVT VT = N->getValueType(0);
+  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
+    return SDValue();
+  DebugLoc dl = N->getDebugLoc();
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
+    Ops.push_back(V);
+    // Make the DAGCombiner fold the bitcast.
+    DCI.AddToWorklist(V.getNode());
+  }
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
+  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
+/// PerformInsertEltCombine - Target-specific dag combine xforms for
+/// ISD::INSERT_VECTOR_ELT.
+static SDValue PerformInsertEltCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  // Bitcast an i64 load inserted into a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  EVT VT = N->getValueType(0);
+  SDNode *Elt = N->getOperand(1).getNode();
+  if (VT.getVectorElementType() != MVT::i64 ||
+      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = N->getDebugLoc();
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 VT.getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(V.getNode());
+  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
+                               Vec, V, N->getOperand(2));
+  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
+}
+
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // The LLVM shufflevector instruction does not require the shuffle mask
+  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
+  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
+  // operands do not match the mask length, they are extended by concatenating
+  // them with undef vectors.  That is probably the right thing for other
+  // targets, but for NEON it is better to concatenate two double-register
+  // size vector operands into a single quad-register size vector.  Do that
+  // transformation here:
+  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
+  //   shuffle(concat(v1, v2), undef)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op0.getNumOperands() != 2 ||
+      Op1.getNumOperands() != 2)
+    return SDValue();
+  SDValue Concat0Op1 = Op0.getOperand(1);
+  SDValue Concat1Op1 = Op1.getOperand(1);
+  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
+      Concat1Op1.getOpcode() != ISD::UNDEF)
+    return SDValue();
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  if (!TLI.isTypeLegal(VT) ||
+      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
+      !TLI.isTypeLegal(Concat1Op1.getValueType()))
+    return SDValue();
+
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                                  Op0.getOperand(0), Op1.getOperand(0));
+  // Translate the shuffle mask.
+  SmallVector<int, 16> NewMask;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfElts = NumElts/2;
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  for (unsigned n = 0; n < NumElts; ++n) {
+    int MaskElt = SVN->getMaskElt(n);
+    int NewElt = -1;
+    if (MaskElt < (int)HalfElts)
+      NewElt = MaskElt;
+    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
+      NewElt = HalfElts + MaskElt - NumElts;
+    NewMask.push_back(NewElt);
+  }
+  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+                              DAG.getUNDEF(VT), NewMask.data());
+}
+
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: assert(0 && "unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: assert(0 && "unexpected opcode for Neon base update");
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else 
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  } 
+  return SDValue();
+}
+
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
+/// return true.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // vldN-dup instructions only support 64-bit vectors for N > 1.
+  if (!VT.is64BitVector())
+    return false;
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = ARMISD::VLD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = ARMISD::VLD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = ARMISD::VLD4DUP;
+  } else {
+    return false;
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != ARMISD::VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return false;
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+                                           Ops, 2, VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return true;
+}
+
 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
 /// ARMISD::VDUPLANE.
 static SDValue PerformVDUPLANECombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
-  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
-  // redundant.
   SDValue Op = N->getOperand(0);
-  EVT VT = N->getValueType(0);
 
-  // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BIT_CONVERT)
+  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
+  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.  Ignore bit_converts for now; element sizes are checked below.
+  while (Op.getOpcode() == ISD::BITCAST)
     Op = Op.getOperand(0);
   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
     return SDValue();
@@ -4521,11 +5683,11 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
   unsigned EltBits;
   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
     EltSize = 8;
+  EVT VT = N->getValueType(0);
   if (EltSize > VT.getVectorElementType().getSizeInBits())
     return SDValue();
 
-  SDValue Res = DCI.DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
-  return DCI.CombineTo(N, Res, false);
+  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
 }
 
 /// getVShiftImm - Check if this is a valid build_vector for the immediate
@@ -4533,7 +5695,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
 /// build_vector must have the same constant integer value.
 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
   // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BIT_CONVERT)
+  while (Op.getOpcode() == ISD::BITCAST)
     Op = Op.getOperand(0);
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   APInt SplatBits, SplatUndef;
@@ -4747,7 +5909,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
 
   // Nothing to be done for scalar shifts.
-  if (! VT.isVector())
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
     return SDValue();
 
   assert(ST->hasNEON() && "unexpected vector shift");
@@ -4793,7 +5956,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
 
     if (VT == MVT::i32 &&
         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
-        TLI.isTypeLegal(Vec.getValueType())) {
+        TLI.isTypeLegal(Vec.getValueType()) &&
+        isa<ConstantSDNode>(Lane)) {
 
       unsigned Opc = 0;
       switch (N->getOpcode()) {
@@ -4906,7 +6070,14 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
+  case ISD::AND:        return PerformANDCombine(N, DCI);
+  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+  case ISD::STORE:      return PerformSTORECombine(N, DCI);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
@@ -4916,20 +6087,42 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
   case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::VLD2DUP:
+  case ARMISD::VLD3DUP:
+  case ARMISD::VLD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default: break;
+    }
+    break;
   }
   return SDValue();
 }
 
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
-  if (!Subtarget->hasV6Ops())
-    // Pre-v6 does not support unaligned mem access.
-    return false;
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                          EVT VT) const {
+  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
 
-  // v6+ may or may not support unaligned mem access depending on the system
-  // configuration.
-  // FIXME: This is pretty conservative. Should we provide cmdline option to
-  // control the behaviour?
-  if (!Subtarget->isTargetDarwin())
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+  if (!Subtarget->allowsUnalignedMem())
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
@@ -5143,7 +6336,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   if (!Subtarget->isThumb())
     return ARM_AM::getSOImmVal(Imm) != -1;
   if (Subtarget->isThumb2())
-    return ARM_AM::getT2SOImmVal(Imm) != -1; 
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
   return Imm >= 0 && Imm <= 255;
 }
 
@@ -5348,6 +6541,37 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
 //                           ARM Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  // Looking for "rev" which is V6+.
+  if (!Subtarget->hasV6Ops())
+    return false;
+
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::string AsmStr = IA->getAsmString();
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t,");
+
+    // rev $0, $1
+    if (AsmPieces.size() == 3 &&
+        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
+        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
+      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      if (Ty && Ty->getBitWidth() == 32)
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  }
+
+  return false;
+}
+
 /// getConstraintType - Given a constraint letter, return the type of
 /// constraint it is for this target.
 ARMTargetLowering::ConstraintType
@@ -5362,6 +6586,40 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
   return TargetLowering::getConstraintType(Constraint);
 }
 
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  const Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'l':
+    if (type->isIntegerTy()) {
+      if (Subtarget->isThumb())
+        weight = CW_SpecificReg;
+      else
+        weight = CW_Register;
+    }
+    break;
+  case 'w':
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
 std::pair<unsigned, const TargetRegisterClass*>
 ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                 EVT VT) const {
@@ -5664,3 +6922,63 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
     return ARM::getVFPf64Imm(Imm) != -1;
   return false;
 }
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      const Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
index ba9ea7f..dc400c4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -34,6 +34,10 @@ namespace llvm {
 
       Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
                     // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperDYN,   // WrapperDYN - A wrapper node for TargetGlobalAddress in
+                    // DYN mode.
+      WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
+                    // PIC mode.
       WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
 
       CALL,         // Function call.
@@ -47,8 +51,6 @@ namespace llvm {
 
       PIC_ADD,      // Add with a PC operand and a PIC label.
 
-      AND,          // ARM "and" instruction that sets the 's' flag in CPSR.
-
       CMP,          // ARM compare instructions.
       CMPZ,         // ARM compare that sets only Z flag.
       CMPFP,        // ARM VFP compare instruction, sets FPSCR.
@@ -73,8 +75,9 @@ namespace llvm {
       VMOVRRD,      // double to two gprs.
       VMOVDRR,      // Two gprs to double.
 
-      EH_SJLJ_SETJMP,    // SjLj exception handling setjmp.
-      EH_SJLJ_LONGJMP,   // SjLj exception handling longjmp.
+      EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
+      EH_SJLJ_DISPATCHSETUP,  // SjLj exception handling dispatch setup.
 
       TC_RETURN,    // Tail call return pseudo.
 
@@ -82,13 +85,20 @@ namespace llvm {
 
       DYN_ALLOC,    // Dynamic allocation on the stack.
 
-      MEMBARRIER,   // Memory barrier
-      SYNCBARRIER,  // Memory sync barrier
+      MEMBARRIER,   // Memory barrier (DMB)
+      MEMBARRIER_MCR, // Memory barrier (MCR)
+
+      PRELOAD,      // Preload
       
       VCEQ,         // Vector compare equal.
+      VCEQZ,        // Vector compare equal to zero.
       VCGE,         // Vector compare greater than or equal.
+      VCGEZ,        // Vector compare greater than or equal to zero.
+      VCLEZ,        // Vector compare less than or equal to zero.
       VCGEU,        // Vector compare unsigned greater than or equal.
       VCGT,         // Vector compare greater than.
+      VCGTZ,        // Vector compare greater than zero.
+      VCLTZ,        // Vector compare less than zero.
       VCGTU,        // Vector compare unsigned greater than.
       VTST,         // Vector test bits.
 
@@ -161,7 +171,38 @@ namespace llvm {
       FMIN,
 
       // Bit-field insert
-      BFI
+      BFI,
+      
+      // Vector OR with immediate
+      VORRIMM,
+      // Vector AND with NOT of immediate
+      VBICIMM,
+
+      // Vector load N-element structure to all lanes:
+      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD3DUP,
+      VLD4DUP,
+
+      // NEON loads with post-increment base updates:
+      VLD1_UPD,
+      VLD2_UPD,
+      VLD3_UPD,
+      VLD4_UPD,
+      VLD2LN_UPD,
+      VLD3LN_UPD,
+      VLD4LN_UPD,
+      VLD2DUP_UPD,
+      VLD3DUP_UPD,
+      VLD4DUP_UPD,
+
+      // NEON stores with post-increment base updates:
+      VST1_UPD,
+      VST2_UPD,
+      VST3_UPD,
+      VST4_UPD,
+      VST2LN_UPD,
+      VST3LN_UPD,
+      VST4LN_UPD
     };
   }
 
@@ -193,14 +234,16 @@ namespace llvm {
     virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                                     SelectionDAG &DAG) const;
 
-    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     virtual MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const;
 
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const;
+
     /// allowsUnalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses. of the specified type.
     /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON?
@@ -241,7 +284,15 @@ namespace llvm {
                                                 unsigned Depth) const;
 
 
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+
     ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    EVT VT) const;
@@ -290,6 +341,9 @@ namespace llvm {
     /// materialize the FP immediate as a load from a constant pool.
     virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
 
+    virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                    const CallInst &I,
+                                    unsigned Intrinsic) const;
   protected:
     std::pair<const TargetRegisterClass*, uint8_t>
     findRepresentativeClass(EVT VT) const;
@@ -301,6 +355,8 @@ namespace llvm {
 
     const TargetRegisterInfo *RegInfo;
 
+    const InstrItineraryData *Itins;
+
     /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
     ///
     unsigned ARMPCLabelIndex;
@@ -329,6 +385,7 @@ namespace llvm {
                              ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                     const ARMSubtarget *Subtarget) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -350,6 +407,10 @@ namespace llvm {
     SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 
+                              const ARMSubtarget *ST) const;
+
+    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
@@ -393,6 +454,8 @@ namespace llvm {
                   const SmallVectorImpl<SDValue> &OutVals,
                   DebugLoc dl, SelectionDAG &DAG) const;
 
+    virtual bool isUsedByReturnOnly(SDNode *N) const;
+
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                       SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const;
     SDValue getVFPCmp(SDValue LHS, SDValue RHS,
@@ -410,6 +473,13 @@ namespace llvm {
 
   };
   
+  enum NEONModImmType {
+    VMOVModImm,
+    VMVNModImm,
+    OtherModImm
+  };
+  
+  
   namespace ARM {
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo);
   }
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
index 113cfff..765cba4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -1,4 +1,4 @@
-//===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=//
+//===- ARMInstrFormats.td - ARM Instruction Formats ----------*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -71,7 +71,7 @@ def NVTBLFrm      : Format<41>;
 
 // Misc flags.
 
-// the instruction has a Rn register operand.
+// The instruction has an Rn register operand.
 // UnaryDP - Indicates this is a unary data processing instruction, i.e.
 // it doesn't have a Rn operand.
 class UnaryDP    { bit isUnaryDataProc = 1; }
@@ -84,9 +84,10 @@ class Xform16Bit { bit canXformTo16Bit = 1; }
 // ARM Instruction flags.  These need to match ARMBaseInstrInfo.h.
 //
 
+// FIXME: Once the JIT is MC-ized, these can go away.
 // Addressing mode.
-class AddrMode<bits<4> val> {
-  bits<4> Value = val;
+class AddrMode<bits<5> val> {
+  bits<5> Value = val;
 }
 def AddrModeNone    : AddrMode<0>;
 def AddrMode1       : AddrMode<1>;
@@ -104,6 +105,7 @@ def AddrModeT2_i8   : AddrMode<12>;
 def AddrModeT2_so   : AddrMode<13>;
 def AddrModeT2_pc   : AddrMode<14>;
 def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12    : AddrMode<16>;
 
 // Instruction size.
 class SizeFlagVal<bits<3> val> {
@@ -134,7 +136,6 @@ def NeonDomain    : Domain<2>; // Instructions in Neon domain only
 def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
 
 //===----------------------------------------------------------------------===//
-
 // ARM special operands.
 //
 
@@ -143,6 +144,39 @@ def CondCodeOperand : AsmOperandClass {
   let SuperClasses = [];
 }
 
+def CCOutOperand : AsmOperandClass {
+  let Name = "CCOut";
+  let SuperClasses = [];
+}
+
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMemBarrierOptOperand";
+}
+
+def ProcIFlagsOperand : AsmOperandClass {
+  let Name = "ProcIFlags";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseProcIFlagsOperand";
+}
+
+def MSRMaskOperand : AsmOperandClass {
+  let Name = "MSRMask";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMSRMaskOperand";
+}
+
+// ARM imod and iflag operands, used only by the CPS instruction.
+def imod_op : Operand<i32> {
+  let PrintMethod = "printCPSIMod";
+}
+
+def iflags_op : Operand<i32> {
+  let PrintMethod = "printCPSIFlag";
+  let ParserMatchClass = ProcIFlagsOperand;
+}
+
 // ARM Predicate operand. Default to 14 = always (AL). Second part is CC
 // register whose default is 0 (no register).
 def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
@@ -153,16 +187,23 @@ def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
 
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
 }
 
 // Same as cc_out except it defaults to setting CPSR.
 def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+  let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
 }
 
 // ARM special operands for disassembly only.
 //
+def setend_op : Operand<i32> {
+  let PrintMethod = "printSetendOperand";
+}
 
 def cps_opt : Operand<i32> {
   let PrintMethod = "printCPSOptionOperand";
@@ -170,6 +211,7 @@ def cps_opt : Operand<i32> {
 
 def msr_mask : Operand<i32> {
   let PrintMethod = "printMSRMaskOperand";
+  let ParserMatchClass = MSRMaskOperand;
 }
 
 // A8.6.117, A8.6.118.  Different instructions are generated for #0 and #-0.
@@ -179,7 +221,6 @@ def neg_zero : Operand<i32> {
 }
 
 //===----------------------------------------------------------------------===//
-
 // ARM Instruction templates.
 //
 
@@ -198,14 +239,17 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im,
   bit isUnaryDataProc = 0;
   bit canXformTo16Bit = 0;
 
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
   // The layout of TSFlags should be kept in sync with ARMBaseInstrInfo.h.
-  let TSFlags{3-0}   = AM.Value;
-  let TSFlags{6-4}   = SZ.Value;
-  let TSFlags{8-7}   = IndexModeBits;
-  let TSFlags{14-9}  = Form;
-  let TSFlags{15}    = isUnaryDataProc;
-  let TSFlags{16}    = canXformTo16Bit;
-  let TSFlags{18-17} = D.Value;
+  let TSFlags{4-0}   = AM.Value;
+  let TSFlags{7-5}   = SZ.Value;
+  let TSFlags{9-8}   = IndexModeBits;
+  let TSFlags{15-10} = Form;
+  let TSFlags{16}    = isUnaryDataProc;
+  let TSFlags{17}    = canXformTo16Bit;
+  let TSFlags{19-18} = D.Value;
 
   let Constraints = cstr;
   let Itinerary = itin;
@@ -225,25 +269,51 @@ class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im,
                 Format f, Domain d, string cstr, InstrItinClass itin>
   : InstTemplate<am, sz, im, f, d, cstr, itin>;
 
-class PseudoInst<dag oops, dag iops, InstrItinClass itin,
-                 string asm, list<dag> pattern>
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
+  // FIXME: This really should derive from InstTemplate instead, as pseudos
+  //        don't need encoding information. TableGen doesn't like that
+  //        currently. Need to figure out why and fix it.
   : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain,
             "", itin> {
   let OutOperandList = oops;
   let InOperandList = iops;
-  let AsmString = asm;
   let Pattern = pattern;
 }
 
+// PseudoInst that's ARM-mode only.
+class ARMPseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let SZ = sz;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// PseudoInst that's Thumb-mode only.
+class tPseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let SZ = sz;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// PseudoInst that's Thumb2-mode only.
+class t2PseudoInst<dag oops, dag iops, SizeFlagVal sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let SZ = sz;
+  list<Predicate> Predicates = [IsThumb2];
+}
 // Almost all ARM instructions are predicable.
 class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
         IndexMode im, Format f, InstrItinClass itin,
         string opc, string asm, string cstr,
         list<dag> pattern>
   : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsARM];
 }
@@ -270,9 +340,14 @@ class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
          string opc, string asm, string cstr,
          list<dag> pattern>
   : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p; // Predicate operand
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{31-28} = p;
+  let Inst{20} = s;
+
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
-  let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsARM];
 }
@@ -319,10 +394,6 @@ class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
        asm, "", pattern> {
   let Inst{27-24} = opcod;
 }
-class ABXIx2<dag oops, dag iops, InstrItinClass itin,
-             string asm, list<dag> pattern>
-  : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, Pseudo, itin,
-       asm, "", pattern>;
 
 // BR_JT instructions
 class JTI<dag oops, dag iops, InstrItinClass itin,
@@ -335,19 +406,42 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> Rn;
   let Inst{27-23} = 0b00011;
   let Inst{22-21} = opcod;
   let Inst{20}    = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
   let Inst{11-0}  = 0b111110011111;
 }
 class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> Rn;
   let Inst{27-23} = 0b00011;
   let Inst{22-21} = opcod;
   let Inst{20}    = 0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
   let Inst{11-4}  = 0b11111001;
+  let Inst{3-0}   = Rt;
+}
+class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
+  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, [$Rn]", pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> Rn;
+  let Inst{27-23} = 0b00010;
+  let Inst{22} = b;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
+  let Inst{11-4} = 0b00001001;
+  let Inst{3-0} = Rt2;
 }
 
 // addrmode1 instructions
@@ -372,387 +466,125 @@ class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
   let Inst{24-21} = opcod;
   let Inst{27-26} = 0b00;
 }
-class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin,
-            string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern>;
-
-
-// addrmode2 loads and stores
-class AI2<dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{27-26} = 0b01;
-}
 
 // loads
-class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-
-// stores
-class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
 
-// Pre-indexed loads
-class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-
-// Pre-indexed stores
-class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+// LDR/LDRB/STR/STRB/...
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am,
+             Format f, InstrItinClass itin, string opc, string asm,
+             list<dag> pattern>
+  : I<oops, iops, am, Size4Bytes, IndexModeNone, f, itin, opc, asm,
+      "", pattern> {
+  let Inst{27-25} = op;
+  let Inst{24} = 1;  // 24 == P
+  // 23 == U
+  let Inst{22} = isByte;
+  let Inst{21} = 0;  // 21 == W
+  let Inst{20} = isLd;
+}
+// Indexed load/stores
+class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, im, f, itin,
       opc, asm, cstr, pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-26} = 0b01;
-}
-
-// Post-indexed loads
-class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-26} = 0b01;
-}
-
-// Post-indexed stores
-class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 0; // B bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-26} = 0b01;
-}
-class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{22}    = 1; // B bit
-  let Inst{24}    = 0; // P bit
+  bits<4> Rt;
   let Inst{27-26} = 0b01;
+  let Inst{24}    = isPre; // P bit
+  let Inst{22}    = isByte; // B bit
+  let Inst{21}    = isPre; // W bit
+  let Inst{20}    = isLd; // L bit
+  let Inst{15-12} = Rt;
+}
+class AI2stridx<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = offset{13};
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = offset{11-0};
 }
 
 // addrmode3 instructions
-class AI3<dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern>;
-class AXI3<dag oops, dag iops, Format f, InstrItinClass itin,
-           string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern>;
-
-// loads
-class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-}
-class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
-              string opc, string asm, list<dag> pattern>
+class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
       opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
+  bits<14> addr;
+  bits<4> Rt;
   let Inst{27-25} = 0b000;
-}
-class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
-               string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-}
-class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
-              string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+}
+
+class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
   let Inst{27-25} = 0b000;
-}
-class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
-               string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-}
-class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
+  let Inst{24}    = isPre;        // P bit
+  let Inst{21}    = isPre;        // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM3 store w/ two operands: (GPR, am3offset)
+  bits<14> offset;
+  bits<4> Rt;
+  bits<4> Rn;
   let Inst{27-25} = 0b000;
+  let Inst{23}    = offset{8};
+  let Inst{22}    = offset{9};
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = offset{7-4};  // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = offset{3-0};  // imm3_0/Rm
 }
 
 // stores
-class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
              string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
       opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
-              string asm, list<dag> pattern>
-  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-       asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
-}
-class AI3std<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
-      opc, asm, "", pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 1; // P bit
+  bits<14> addr;
+  bits<4> Rt;
   let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = 0;            // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
 }
 
-// Pre-indexed loads
-class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin,
-                string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin,
-                string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3lddpr<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 1; // W bit
-  let Inst{24}    = 1; // P bit
-  let Inst{27-25} = 0b000;
-}
-
-
 // Pre-indexed stores
 class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
                string opc, string asm, string cstr, list<dag> pattern>
@@ -781,60 +613,6 @@ class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin,
   let Inst{27-25} = 0b000;
 }
 
-// Post-indexed loads
-class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin,
-               string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 0; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin,
-                string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 1; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin,
-                string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr,pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 1; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
-}
-class AI3lddpo<dag oops, dag iops, Format f, InstrItinClass itin,
-             string opc, string asm, string cstr, list<dag> pattern>
-  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
-      opc, asm, cstr, pattern> {
-  let Inst{4}     = 1;
-  let Inst{5}     = 0; // H bit
-  let Inst{6}     = 1; // S bit
-  let Inst{7}     = 1;
-  let Inst{20}    = 0; // L bit
-  let Inst{21}    = 0; // W bit
-  let Inst{24}    = 0; // P bit
-  let Inst{27-25} = 0b000;
-}
-
 // Post-indexed stores
 class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
                string opc, string asm, string cstr, list<dag> pattern>
@@ -864,21 +642,17 @@ class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin,
 }
 
 // addrmode4 instructions
-class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
-             string asm, string cstr, list<dag> pattern>
-  : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin,
-       asm, cstr, pattern> {
-  let Inst{20}    = 1; // L bit
-  let Inst{22}    = 0; // S bit
+class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin, asm, cstr, pattern> {
+  bits<4>  p;
+  bits<16> regs;
+  bits<4>  Rn;
+  let Inst{31-28} = p;
   let Inst{27-25} = 0b100;
-}
-class AXI4st<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
-             string asm, string cstr, list<dag> pattern>
-  : XI<oops, iops, AddrMode4, Size4Bytes, im, f, itin,
-       asm, cstr, pattern> {
-  let Inst{20}    = 0; // L bit
   let Inst{22}    = 0; // S bit
-  let Inst{27-25} = 0b100;
+  let Inst{19-16} = Rn;
+  let Inst{15-0}  = regs;
 }
 
 // Unsigned multiply, multiply-accumulate instructions.
@@ -899,24 +673,65 @@ class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
 }
 
 // Most significant word multiply
-class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
-             string opc, string asm, list<dag> pattern>
+class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+             InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
       opc, asm, "", pattern> {
-  let Inst{7-4}   = 0b1001;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{7-4}   = opc7_4;
   let Inst{20}    = 1;
   let Inst{27-21} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+// MSW multiple w/ Ra operand
+class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
 }
 
 // SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
-class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
-              string opc, string asm, list<dag> pattern>
+class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
       opc, asm, "", pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
   let Inst{4}     = 0;
   let Inst{7}     = 1;
   let Inst{20}    = 0;
   let Inst{27-21} = opcod;
+  let Inst{6-5}   = bit6_5;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+}
+
+// AMulxyI with Ra operand
+class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+// SMLAL*
+class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
 }
 
 // Extend instructions.
@@ -924,16 +739,47 @@ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin,
       opc, asm, "", pattern> {
+  // All AExtI instructions have Rd and Rm register operands.
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
   let Inst{7-4}   = 0b0111;
+  let Inst{9-8}   = 0b00;
   let Inst{27-20} = opcod;
 }
 
 // Misc Arithmetic instructions.
-class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
-               string opc, string asm, list<dag> pattern>
+class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin,
       opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
   let Inst{27-20} = opcod;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+// PKH instructions
+class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<8> sh;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = sh{7-3};
+  let Inst{6}     = tb;
+  let Inst{5-4}   = 0b01;
+  let Inst{3-0}   = Rm;
 }
 
 //===----------------------------------------------------------------------===//
@@ -950,12 +796,9 @@ class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
 }
 
 //===----------------------------------------------------------------------===//
-//
 // Thumb Instruction Format Definitions.
 //
 
-// TI - Thumb instruction.
-
 class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
              InstrItinClass itin, string asm, string cstr, list<dag> pattern>
   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
@@ -966,6 +809,7 @@ class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   list<Predicate> Predicates = [IsThumb];
 }
 
+// TI - Thumb instruction.
 class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
   : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
 
@@ -986,6 +830,13 @@ class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3,
   let Inst{12}    = opcod3;
 }
 
+// Move to/from coprocessor instructions
+class T1Cop<dag oops, dag iops, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, Size4Bytes, NoItinerary, asm, "", pattern>,
+    Encoding, Requires<[IsThumb, HasV6]> {
+  let Inst{31-28} = 0b1110;
+}
+
 // BR_JT instructions
 class TJTI<dag oops, dag iops, InstrItinClass itin, string asm,
            list<dag> pattern>
@@ -999,7 +850,7 @@ class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   let InOperandList = iops;
   let AsmString = asm;
   let Pattern = pattern;
-  list<Predicate> Predicates = [IsThumb1Only];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
 class T1I<dag oops, dag iops, InstrItinClass itin,
@@ -1008,9 +859,6 @@ class T1I<dag oops, dag iops, InstrItinClass itin,
 class T1Ix2<dag oops, dag iops, InstrItinClass itin,
             string asm, list<dag> pattern>
   : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
-class T1JTI<dag oops, dag iops, InstrItinClass itin,
-            string asm, list<dag> pattern>
-  : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
 
 // Two-address instructions
 class T1It<dag oops, dag iops, InstrItinClass itin,
@@ -1025,9 +873,9 @@ class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
   let OutOperandList = !con(oops, (outs s_cc_out:$s));
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
   let Pattern = pattern;
-  list<Predicate> Predicates = [IsThumb1Only];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
 class T1sI<dag oops, dag iops, InstrItinClass itin,
@@ -1038,7 +886,7 @@ class T1sI<dag oops, dag iops, InstrItinClass itin,
 class T1sIt<dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
-             "$lhs = $dst", pattern>;
+             "$Rn = $Rdn", pattern>;
 
 // Thumb1 instruction that can be predicated.
 class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
@@ -1047,9 +895,9 @@ class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
-  list<Predicate> Predicates = [IsThumb1Only];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
 class T1pI<dag oops, dag iops, InstrItinClass itin,
@@ -1060,17 +908,8 @@ class T1pI<dag oops, dag iops, InstrItinClass itin,
 class T1pIt<dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
-             "$lhs = $dst", pattern>;
+             "$Rn = $Rdn", pattern>;
 
-class T1pI1<dag oops, dag iops, InstrItinClass itin,
-            string opc, string asm, list<dag> pattern>
-  : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>;
-class T1pI2<dag oops, dag iops, InstrItinClass itin,
-            string opc, string asm, list<dag> pattern>
-  : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>;
-class T1pI4<dag oops, dag iops, InstrItinClass itin,
-            string opc, string asm, list<dag> pattern>
-  : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>;
 class T1pIs<dag oops, dag iops,
             InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>;
@@ -1099,7 +938,7 @@ class T1DataProcessing<bits<4> opcode> : Encoding16 {
 // A6.2.3 Special data instructions and branch and exchange encoding.
 class T1Special<bits<4> opcode> : Encoding16 {
   let Inst{15-10} = 0b010001;
-  let Inst{9-6} = opcode;
+  let Inst{9-6}   = opcode;
 }
 
 // A6.2.4 Load/store single data item encoding.
@@ -1107,12 +946,37 @@ class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
   let Inst{15-12} = opA;
   let Inst{11-9}  = opB;
 }
-class T1LdSt<bits<3> opB>     : T1LoadStore<0b0101, opB>;
-class T1LdSt4Imm<bits<3> opB> : T1LoadStore<0b0110, opB>; // Immediate, 4 bytes
-class T1LdSt1Imm<bits<3> opB> : T1LoadStore<0b0111, opB>; // Immediate, 1 byte
-class T1LdSt2Imm<bits<3> opB> : T1LoadStore<0b1000, opB>; // Immediate, 2 bytes
 class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
 
+// Helper classes to encode Thumb1 loads and stores. For immediates, the
+// following bits are used for "opA" (see A6.2.4):
+//
+//   0b0110 => Immediate, 4 bytes
+//   0b1000 => Immediate, 2 bytes
+//   0b0111 => Immediate, 1 byte
+class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am,
+                     InstrItinClass itin, string opc, string asm,
+                     list<dag> pattern>
+  : Thumb1pI<oops, iops, am, Size2Bytes, itin, opc, asm, "", pattern>,
+    T1LoadStore<0b0101, opcode> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{8-6} = addr{5-3};    // Rm
+  let Inst{5-3} = addr{2-0};    // Rn
+  let Inst{2-0} = Rt;
+}
+class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : Thumb1pI<oops, iops, am, Size2Bytes, itin, opc, asm, "", pattern>,
+    T1LoadStore<opA, {opB,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-6} = addr{7-3};   // imm5
+  let Inst{5-3}  = addr{2-0};   // Rn
+  let Inst{2-0}  = Rt;
+}
+
 // A6.2.5 Miscellaneous 16-bit instructions encoding.
 class T1Misc<bits<7> opcode> : Encoding16 {
   let Inst{15-12} = 0b1011;
@@ -1126,7 +990,7 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
 }
@@ -1134,16 +998,19 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
 // Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
 // input operand since by default it's a zero register. It will become an
 // implicit def once it's "flipped".
-// 
+//
 // FIXME: This uses unified syntax so {s} comes before {p}. We should make it
 // more consistent.
 class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
                InstrItinClass itin,
                string opc, string asm, string cstr, list<dag> pattern>
   : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{20} = s;
+
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
-  let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
 }
@@ -1168,7 +1035,7 @@ class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   let InOperandList = iops;
   let AsmString = asm;
   let Pattern = pattern;
-  list<Predicate> Predicates = [IsThumb1Only];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
 class T2I<dag oops, dag iops, InstrItinClass itin,
@@ -1186,17 +1053,23 @@ class T2Iso<dag oops, dag iops, InstrItinClass itin,
 class T2Ipc<dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>;
-class T2Ii8s4<bit P, bit W, bit load, dag oops, dag iops, InstrItinClass itin,
+class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "",
             pattern> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b00;
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<13> addr;
+  let Inst{31-25} = 0b1110100;
   let Inst{24}    = P;
-  let Inst{23}    = ?; // The U bit.
+  let Inst{23}    = addr{8};
   let Inst{22}    = 1;
   let Inst{21}    = W;
-  let Inst{20}    = load;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = addr{7-0};
 }
 
 class T2sI<dag oops, dag iops, InstrItinClass itin,
@@ -1210,9 +1083,11 @@ class T2JTI<dag oops, dag iops, InstrItinClass itin,
             string asm, list<dag> pattern>
   : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
 
-class T2Ix2<dag oops, dag iops, InstrItinClass itin,
-            string opc, string asm, list<dag> pattern>
-  : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>;
+// Move to/from coprocessor instructions
+class T2Cop<dag oops, dag iops, string asm, list<dag> pattern>
+  : T2XI<oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2, HasV6]> {
+  let Inst{31-28} = 0b1111;
+}
 
 // Two-address instructions
 class T2XIt<dag oops, dag iops, InstrItinClass itin,
@@ -1227,7 +1102,7 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
   : InstARM<am, Size4Bytes, im, ThumbFrm, GenericDomain, cstr, itin> {
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [IsThumb2];
   let Inst{31-27} = 0b11111;
@@ -1240,29 +1115,25 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
   // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
   let Inst{10}    = pre; // The P bit.
   let Inst{8}     = 1; // The W bit.
-}
 
-// Helper class for disassembly only
-// A6.3.16 & A6.3.17
-// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
-class T2I_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, dag iops,
-             InstrItinClass itin, string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
-  let Inst{31-27} = 0b11111;
-  let Inst{26-24} = 0b011;
-  let Inst{23}    = long;
-  let Inst{22-20} = op22_20;
-  let Inst{7-4}   = op7_4;
+  bits<9> addr;
+  let Inst{7-0} = addr{7-0};
+  let Inst{9}   = addr{8}; // Sign bit
+
+  bits<4> Rt;
+  bits<4> Rn;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{19-16} = Rn{3-0};
 }
 
 // Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
 class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
-  list<Predicate> Predicates = [IsThumb1Only, HasV5T];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T];
 }
 
 // T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
 class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
-  list<Predicate> Predicates = [IsThumb1Only];
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
 // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
@@ -1281,10 +1152,13 @@ class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
            IndexMode im, Format f, InstrItinClass itin,
            string opc, string asm, string cstr, list<dag> pattern>
   : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let AsmString = !strconcat(opc, "${p}", asm);
   let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
   list<Predicate> Predicates = [HasVFP2];
 }
 
@@ -1293,17 +1167,22 @@ class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
             IndexMode im, Format f, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
   let OutOperandList = oops;
   let InOperandList = iops;
   let AsmString = asm;
   let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
   list<Predicate> Predicates = [HasVFP2];
 }
 
 class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
-         opc, asm, "", pattern>;
+         opc, asm, "", pattern> {
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+}
 
 // ARM VFP addrmode5 loads and stores
 class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
@@ -1311,12 +1190,24 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
            string opc, string asm, list<dag> pattern>
   : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
          VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Dd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Dd{4};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Dd{3-0};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
   // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-24} = opcod1;
   let Inst{21-20} = opcod2;
-  let Inst{11-8}  = 0b1011;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
 
-  // 64-bit loads & stores operate on both NEON and VFP pipelines.
+  // Loads & stores operate on both NEON and VFP pipelines.
   let D = VFPNeonDomain;
 }
 
@@ -1325,10 +1216,36 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
            string opc, string asm, list<dag> pattern>
   : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
          VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Sd{0};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Sd{4-1};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
   // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-24} = opcod1;
   let Inst{21-20} = opcod2;
-  let Inst{11-8}  = 0b1010;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+// VFP Load / store multiple pseudo instructions.
+class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
+                     list<dag> pattern>
+  : InstARM<AddrMode4, Size4Bytes, IndexModeNone, Pseudo, VFPNeonDomain,
+            cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
 }
 
 // Load / store multiple
@@ -1336,21 +1253,40 @@ class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : VFPXI<oops, iops, AddrMode4, Size4Bytes, im,
           VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{12};
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-0}   = regs{7-0};
+
   // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-25} = 0b110;
-  let Inst{11-8}  = 0b1011;
-
-  // 64-bit loads & stores operate on both NEON and VFP pipelines.
-  let D = VFPNeonDomain;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
 }
 
 class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : VFPXI<oops, iops, AddrMode4, Size4Bytes, im,
           VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{8};
+  let Inst{15-12} = regs{12-9};
+  let Inst{7-0}   = regs{7-0};
+
   // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-25} = 0b110;
-  let Inst{11-8}  = 0b1010;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
 }
 
 // Double precision, unary
@@ -1358,10 +1294,21 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
            string asm, list<dag> pattern>
   : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
   let Inst{27-23} = opcod1;
   let Inst{21-20} = opcod2;
   let Inst{19-16} = opcod3;
-  let Inst{11-8}  = 0b1011;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
   let Inst{7-6}   = opcod4;
   let Inst{4}     = opcod5;
 }
@@ -1371,24 +1318,25 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
            dag iops, InstrItinClass itin, string opc, string asm,
            list<dag> pattern>
   : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
-  let Inst{27-23} = opcod1;
-  let Inst{21-20} = opcod2;
-  let Inst{11-8}  = 0b1011;
-  let Inst{6}     = op6;
-  let Inst{4}     = op4;
-}
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
 
-// Double precision, binary, VML[AS] (for additional predicate)
-class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
-           dag iops, InstrItinClass itin, string opc, string asm,
-           list<dag> pattern>
-  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
   let Inst{27-23} = opcod1;
   let Inst{21-20} = opcod2;
-  let Inst{11-8}  = 0b1011;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
   let Inst{6}     = op6;
   let Inst{4}     = op4;
-  list<Predicate> Predicates = [HasVFP2, UseVMLx];
 }
 
 // Single precision, unary
@@ -1396,16 +1344,27 @@ class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
            string asm, list<dag> pattern>
   : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
   let Inst{27-23} = opcod1;
   let Inst{21-20} = opcod2;
   let Inst{19-16} = opcod3;
-  let Inst{11-8}  = 0b1010;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
   let Inst{7-6}   = opcod4;
   let Inst{4}     = opcod5;
 }
 
-// Single precision unary, if no NEON
-// Same as ASuI except not available if NEON is enabled
+// Single precision unary, if no NEON. Same as ASuI except not available if
+// NEON is enabled.
 class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
             bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
             string asm, list<dag> pattern>
@@ -1418,20 +1377,47 @@ class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
 class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
            InstrItinClass itin, string opc, string asm, list<dag> pattern>
   : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
   let Inst{27-23} = opcod1;
   let Inst{21-20} = opcod2;
-  let Inst{11-8}  = 0b1010;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
   let Inst{6}     = op6;
   let Inst{4}     = op4;
 }
 
-// Single precision binary, if no NEON
-// Same as ASbI except not available if NEON is enabled
+// Single precision binary, if no NEON. Same as ASbI except not available if
+// NEON is enabled.
 class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
             dag iops, InstrItinClass itin, string opc, string asm,
             list<dag> pattern>
   : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
   list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
 }
 
 // VFP conversion instructions
@@ -1502,9 +1488,7 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> {
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(
-                     !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)),
-                     !strconcat("\t", asm));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [HasNEON];
 }
@@ -1516,7 +1500,7 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> {
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm));
+  let AsmString = !strconcat(opc, "${p}", "\t", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [HasNEON];
 }
@@ -1531,6 +1515,25 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
   let Inst{21-20} = op21_20;
   let Inst{11-8}  = op11_8;
   let Inst{7-4}   = op7_4;
+
+  let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+
+  bits<5> Vd;
+  bits<6> Rn;
+  bits<4> Rm;
+
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Rn{3-0};
+  let Inst{3-0}   = Rm{3-0};
+}
+
+class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc,
+          dt, asm, cstr, pattern> {
+  bits<3> lane;
 }
 
 class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
@@ -1541,11 +1544,22 @@ class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
   list<Predicate> Predicates = [HasNEON];
 }
 
+class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr,
+                  list<dag> pattern>
+  : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
 class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
              string opc, string dt, string asm, string cstr, list<dag> pattern>
   : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
           pattern> {
   let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
 }
 
 class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1553,6 +1567,7 @@ class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
   : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm,
            cstr, pattern> {
   let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
 }
 
 // NEON "one register and a modified immediate" format.
@@ -1569,6 +1584,16 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
   let Inst{6}     = op6;
   let Inst{5}     = op5;
   let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<13> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{24}    = SIMM{7};
+  let Inst{18-16} = SIMM{6-4};
+  let Inst{3-0}   = SIMM{3-0};
 }
 
 // NEON 2 vector register format.
@@ -1584,6 +1609,15 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{11-7}  = op11_7;
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
 }
 
 // Same as N2V except it doesn't have a datatype suffix.
@@ -1599,6 +1633,15 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{11-7}  = op11_7;
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
 }
 
 // NEON 2 vector register with immediate.
@@ -1612,6 +1655,17 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
   let Inst{7}    = op7;
   let Inst{6}    = op6;
   let Inst{4}    = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+  bits<6> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{21-16} = SIMM{5-0};
 }
 
 // NEON 3 vector register format.
@@ -1625,6 +1679,18 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{11-8}  = op11_8;
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
 }
 
 // Same as N3V except it doesn't have a data type suffix.
@@ -1639,13 +1705,25 @@ class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
   let Inst{11-8}  = op11_8;
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
 }
 
 // NEON VMOVs between scalar and core registers.
 class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
                dag oops, dag iops, Format f, InstrItinClass itin,
                string opc, string dt, string asm, list<dag> pattern>
-  : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain,
+  : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, NeonDomain,
             "", itin> {
   let Inst{27-20} = opcod1;
   let Inst{11-8}  = opcod2;
@@ -1654,11 +1732,21 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
 
   let OutOperandList = oops;
   let InOperandList = !con(iops, (ins pred:$p));
-  let AsmString = !strconcat(
-                     !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)),
-                     !strconcat("\t", asm));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
   let Pattern = pattern;
   list<Predicate> Predicates = [HasNEON];
+
+  let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+
+  bits<5> V;
+  bits<4> R;
+  bits<4> p;
+  bits<4> lane;
+
+  let Inst{31-28} = p{3-0};
+  let Inst{7}     = V{4};
+  let Inst{19-16} = V{3-0};
+  let Inst{15-12} = R{3-0};
 }
 class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
                 dag oops, dag iops, InstrItinClass itin,
@@ -1687,6 +1775,15 @@ class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
   let Inst{11-7}  = 0b11000;
   let Inst{6}     = op6;
   let Inst{4}     = 0;
+
+  bits<5> Vd;
+  bits<5> Vm;
+  bits<4> lane;
+
+  let Inst{22}     = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0} = Vm{3-0};
 }
 
 // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index ba228ff..6f48d96 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -33,13 +33,13 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
   default: break;
   case ARM::LDR_PRE:
   case ARM::LDR_POST:
-    return ARM::LDR;
+    return ARM::LDRi12;
   case ARM::LDRH_PRE:
   case ARM::LDRH_POST:
     return ARM::LDRH;
   case ARM::LDRB_PRE:
   case ARM::LDRB_POST:
-    return ARM::LDRB;
+    return ARM::LDRBi12;
   case ARM::LDRSH_PRE:
   case ARM::LDRSH_POST:
     return ARM::LDRSH;
@@ -48,39 +48,14 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
     return ARM::LDRSB;
   case ARM::STR_PRE:
   case ARM::STR_POST:
-    return ARM::STR;
+    return ARM::STRi12;
   case ARM::STRH_PRE:
   case ARM::STRH_POST:
     return ARM::STRH;
   case ARM::STRB_PRE:
   case ARM::STRB_POST:
-    return ARM::STRB;
+    return ARM::STRBi12;
   }
 
   return 0;
 }
-
-void ARMInstrInfo::
-reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-              unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig,
-              const TargetRegisterInfo &TRI) const {
-  DebugLoc dl = Orig->getDebugLoc();
-  unsigned Opcode = Orig->getOpcode();
-  switch (Opcode) {
-  default:
-    break;
-  case ARM::MOVi2pieces: {
-    RI.emitLoadConstPool(MBB, I, dl,
-                         DestReg, SubIdx,
-                         Orig->getOperand(1).getImm(),
-                         (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
-                         Orig->getOperand(3).getReg());
-    MachineInstr *NewMI = prior(I);
-    NewMI->getOperand(0).setSubReg(SubIdx);
-    return;
-  }
-  }
-
-  return ARMBaseInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, TRI);
-}
-
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
index 4563ffe..f2c7bdc 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -32,11 +32,6 @@ public:
   // if there is not such an opcode.
   unsigned getUnindexedOpcode(unsigned Opc) const;
 
-  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                     unsigned DestReg, unsigned SubIdx,
-                     const MachineInstr *Orig,
-                     const TargetRegisterInfo &TRI) const;
-
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
index e66f9b9..c827ce3d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -58,10 +58,9 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
                                                  SDTCisInt<2>]>;
 def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
 
-def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 0, []>;
-def SDT_ARMSYNCBARRIER    : SDTypeProfile<0, 0, []>;
-def SDT_ARMMEMBARRIERMCR  : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDT_ARMSYNCBARRIERMCR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
@@ -70,33 +69,35 @@ def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
 
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperDYN    : SDNode<"ARMISD::WrapperDYN",  SDTIntUnaryOp>;
+def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
 def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
 
 def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
-                              [SDNPHasChain, SDNPOutFlag]>;
+                              [SDNPHasChain, SDNPOutGlue]>;
 def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
-                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
 def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
-                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                                SDNPVariadic]>;
 def ARMcall_pred    : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
-                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                                SDNPVariadic]>;
 def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
-                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                                SDNPVariadic]>;
 
 def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
-                              [SDNPHasChain, SDNPOptInFlag]>;
+                              [SDNPHasChain, SDNPOptInGlue]>;
 
 def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
-                              [SDNPInFlag]>;
+                              [SDNPInGlue]>;
 def ARMcneg          : SDNode<"ARMISD::CNEG", SDT_ARMCMov,
-                              [SDNPInFlag]>;
+                              [SDNPInGlue]>;
 
 def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
-                              [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+                              [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
 
 def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
                               [SDNPHasChain]>;
@@ -106,40 +107,38 @@ def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
 def ARMBcci64        : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
                               [SDNPHasChain]>;
 
-def ARMand           : SDNode<"ARMISD::AND", SDT_ARMAnd,
-                              [SDNPOutFlag]>;
-
 def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
-                              [SDNPOutFlag]>;
+                              [SDNPOutGlue]>;
 
 def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
-                              [SDNPOutFlag, SDNPCommutative]>;
+                              [SDNPOutGlue, SDNPCommutative]>;
 
 def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
 
-def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
-def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
-def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInFlag ]>;
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
 
 def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
 def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
                                SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
 def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
-                                SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>;
+                               SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>;
+def ARMeh_sjlj_dispatchsetup: SDNode<"ARMISD::EH_SJLJ_DISPATCHSETUP",
+                               SDT_ARMEH_SJLJ_DispatchSetup, [SDNPHasChain]>;
+
 
 def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
                                [SDNPHasChain]>;
-def ARMSyncBarrier    : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIER,
-                               [SDNPHasChain]>;
-def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERMCR,
-                               [SDNPHasChain]>;
-def ARMSyncBarrierMCR : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERMCR,
+def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
                                [SDNPHasChain]>;
+def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDTPrefetch,
+                               [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
 
 def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
 
-def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, 
-                        [SDNPHasChain,  SDNPOptInFlag, SDNPVariadic]>;
+def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
 
 
 def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
@@ -147,34 +146,40 @@ def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
-def HasV4T           : Predicate<"Subtarget->hasV4TOps()">;
+def HasV4T           : Predicate<"Subtarget->hasV4TOps()">, AssemblerPredicate;
 def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
 def HasV5T           : Predicate<"Subtarget->hasV5TOps()">;
-def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">;
-def HasV6            : Predicate<"Subtarget->hasV6Ops()">;
-def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">;
+def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">, AssemblerPredicate;
+def HasV6            : Predicate<"Subtarget->hasV6Ops()">, AssemblerPredicate;
+def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">, AssemblerPredicate;
 def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
-def HasV7            : Predicate<"Subtarget->hasV7Ops()">;
+def HasV7            : Predicate<"Subtarget->hasV7Ops()">, AssemblerPredicate;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
-def HasVFP2          : Predicate<"Subtarget->hasVFP2()">;
-def HasVFP3          : Predicate<"Subtarget->hasVFP3()">;
-def HasNEON          : Predicate<"Subtarget->hasNEON()">;
-def HasDivide        : Predicate<"Subtarget->hasDivide()">;
-def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">;
-def HasDB            : Predicate<"Subtarget->hasDataBarrier()">;
+def HasVFP2          : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate;
+def HasVFP3          : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate;
+def HasFP16          : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate;
+def HasDivide        : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate;
+def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
+                                 AssemblerPredicate;
+def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
+                                 AssemblerPredicate;
+def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
+                                 AssemblerPredicate;
 def UseNEONForFP     : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
-def IsThumb          : Predicate<"Subtarget->isThumb()">;
+def IsThumb          : Predicate<"Subtarget->isThumb()">, AssemblerPredicate;
 def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
-def IsThumb2         : Predicate<"Subtarget->isThumb2()">;
-def IsARM            : Predicate<"!Subtarget->isThumb()">;
+def IsThumb2         : Predicate<"Subtarget->isThumb2()">, AssemblerPredicate;
+def IsARM            : Predicate<"!Subtarget->isThumb()">, AssemblerPredicate;
 def IsDarwin         : Predicate<"Subtarget->isTargetDarwin()">;
 def IsNotDarwin      : Predicate<"!Subtarget->isTargetDarwin()">;
 
 // FIXME: Eventually this will be just "hasV6T2Ops".
 def UseMovt          : Predicate<"Subtarget->useMovt()">;
 def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;
-def UseVMLx          : Predicate<"Subtarget->useVMLx()">;
+def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
 
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
@@ -199,12 +204,6 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
 }]>;
 
-// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
-def rot_imm : PatLeaf<(i32 imm), [{
-  int32_t v = (int32_t)N->getZExtValue();
-  return v == 8 || v == 16 || v == 24;
-}]>;
-
 /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15].
 def imm1_15 : PatLeaf<(i32 imm), [{
   return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16;
@@ -217,12 +216,12 @@ def imm16_31 : PatLeaf<(i32 imm), [{
 
 def so_imm_neg :
   PatLeaf<(imm), [{
-    return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1;
+    return ARM_AM::getSOImmVal(-(uint32_t)N->getZExtValue()) != -1;
   }], so_imm_neg_XFORM>;
 
 def so_imm_not :
   PatLeaf<(imm), [{
-    return ARM_AM::getSOImmVal(~(int)N->getZExtValue()) != -1;
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
   }], so_imm_not_XFORM>;
 
 // sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
@@ -230,15 +229,6 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{
   return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
 }]>;
 
-/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
-/// e.g., 0xf000ffff
-def bf_inv_mask_imm : Operand<i32>,
-                      PatLeaf<(imm), [{
-  return ARM::isBitFieldInvertedMask(N->getZExtValue());
-}] > {
-  let PrintMethod = "printBitfieldInvMaskImmOperand";
-}
-
 /// Split a 32-bit immediate into two 16 bit parts.
 def hi16 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
@@ -273,28 +263,103 @@ def sube_live_carry :
   PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS),
   [{return N->hasAnyUseOfValue(1);}]>;
 
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'xor' node with a single use.
+def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+  return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Operand Definitions.
 //
 
 // Branch target.
-def brtarget : Operand<OtherVT>;
+// FIXME: rename brtarget to t2_brtarget
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+}
+
+// FIXME: get rid of this one?
+def uncondbrtarget : Operand<OtherVT> {
+  let EncoderMethod = "getUnconditionalBranchTargetOpValue";
+}
+
+// Branch target for ARM. Handles conditional/unconditional
+def br_target : Operand<OtherVT> {
+  let EncoderMethod = "getARMBranchTargetOpValue";
+}
+
+// Call target.
+// FIXME: rename bltarget to t2_bl_target?
+def bltarget : Operand<i32> {
+  // Encoded the same as branch targets.
+  let EncoderMethod = "getBranchTargetOpValue";
+}
+
+// Call target for ARM. Handles conditional/unconditional
+// FIXME: rename bl_target to t2_bltarget?
+def bl_target : Operand<i32> {
+  // Encoded the same as branch targets.
+  let EncoderMethod = "getARMBranchTargetOpValue";
+}
+
 
 // A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass {
+  let Name = "RegList";
+  let SuperClasses = [];
+}
+
+def DPRRegListAsmOperand : AsmOperandClass {
+  let Name = "DPRRegList";
+  let SuperClasses = [];
+}
+
+def SPRRegListAsmOperand : AsmOperandClass {
+  let Name = "SPRRegList";
+  let SuperClasses = [];
+}
+
 def reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand;
   let PrintMethod = "printRegisterList";
 }
 
-// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
-def cpinst_operand : Operand<i32> {
-  let PrintMethod = "printCPInstOperand";
+def dpr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = DPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
 }
 
-def jtblock_operand : Operand<i32> {
-  let PrintMethod = "printJTBlockOperand";
+def spr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = SPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
 }
-def jt2block_operand : Operand<i32> {
-  let PrintMethod = "printJT2BlockOperand";
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
 }
 
 // Local PC labels.
@@ -302,6 +367,22 @@ def pclabel : Operand<i32> {
   let PrintMethod = "printPCLabel";
 }
 
+// ADR instruction labels.
+def adrlabel : Operand<i32> {
+  let EncoderMethod = "getAdrLabelOpValue";
+}
+
+def neon_vcvt_imm32 : Operand<i32> {
+  let EncoderMethod = "getNEONVcvtImm32OpValue";
+}
+
+// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    int32_t v = (int32_t)N->getZExtValue();
+    return v == 8 || v == 16 || v == 24; }]> {
+  let EncoderMethod = "getRotImmOpValue";
+}
+
 // shift_imm: An integer that encodes a shift amount and the type of shift
 // (currently either asr or lsl) using the same encoding used for the
 // immediates in so_reg operands.
@@ -313,73 +394,120 @@ def shift_imm : Operand<i32> {
 def so_reg : Operand<i32>,    // reg reg imm
              ComplexPattern<i32, 3, "SelectShifterOperandReg",
                             [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegOpValue";
+  let PrintMethod = "printSORegOperand";
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+def shift_so_reg : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 3, "SelectShiftShifterOperandReg",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegOpValue";
   let PrintMethod = "printSORegOperand";
   let MIOperandInfo = (ops GPR, GPR, i32imm);
 }
 
 // so_imm - Match a 32-bit shifter_operand immediate operand, which is an
-// 8-bit immediate rotated by an arbitrary number of bits.  so_imm values are
-// represented in the imm field in the same 12-bit form that they are encoded
-// into so_imm instructions: the 8-bit immediate is the least significant bits
-// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
+// 8-bit immediate rotated by an arbitrary number of bits.
 def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> {
+  let EncoderMethod = "getSOImmOpValue";
   let PrintMethod = "printSOImmOperand";
 }
 
 // Break so_imm's up into two pieces.  This handles immediates with up to 16
 // bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
 // get the first/second pieces.
-def so_imm2part : Operand<i32>,
-                  PatLeaf<(imm), [{
+def so_imm2part : PatLeaf<(imm), [{
       return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
-    }]> {
-  let PrintMethod = "printSOImm2PartOperand";
-}
+}]>;
 
-def so_imm2part_1 : SDNodeXForm<imm, [{
-  unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
-  return CurDAG->getTargetConstant(V, MVT::i32);
+/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
+///
+def arm_i32imm : PatLeaf<(imm), [{
+  if (Subtarget->hasV6T2Ops())
+    return true;
+  return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
 }]>;
 
-def so_imm2part_2 : SDNodeXForm<imm, [{
-  unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
-  return CurDAG->getTargetConstant(V, MVT::i32);
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
+  return (int32_t)N->getZExtValue() < 32;
 }]>;
 
-def so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{
-      return ARM_AM::isSOImmTwoPartVal(-(int)N->getZExtValue());
-    }]> {
-  let PrintMethod = "printSOImm2PartOperand";
+/// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
+def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{
+  return (int32_t)N->getZExtValue() < 32;
+}]> {
+  let EncoderMethod = "getImmMinusOneOpValue";
 }
 
-def so_neg_imm2part_1 : SDNodeXForm<imm, [{
-  unsigned V = ARM_AM::getSOImmTwoPartFirst(-(int)N->getZExtValue());
-  return CurDAG->getTargetConstant(V, MVT::i32);
-}]>;
+// i32imm_hilo16 - For movt/movw - sets the MC Encoder method.
+// The imm is split into imm{15-12}, imm{11-0}
+//
+def i32imm_hilo16 : Operand<i32> {
+  let EncoderMethod = "getHiLo16ImmOpValue";
+}
 
-def so_neg_imm2part_2 : SDNodeXForm<imm, [{
-  unsigned V = ARM_AM::getSOImmTwoPartSecond(-(int)N->getZExtValue());
-  return CurDAG->getTargetConstant(V, MVT::i32);
-}]>;
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+}
 
-/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+/// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
+def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{
+  return isInt<5>(N->getSExtValue());
 }]>;
 
+/// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
+def width_imm : Operand<i32>, PatLeaf<(imm), [{
+  return N->getSExtValue() > 0 &&  N->getSExtValue() <= 32;
+}] > {
+  let EncoderMethod = "getMsbOpValue";
+}
+
 // Define ARM specific addressing modes.
 
-// addrmode2 := reg +/- reg shop imm
+
+// addrmode_imm12 := reg +/- imm12
+//
+def addrmode_imm12 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+  // 12-bit immediate operand. Note that instructions using this encode
+  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
+  // immediate values are as normal.
+
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printAddrModeImm12Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+// ldst_so_reg := reg +/- reg shop imm
+//
+def ldst_so_reg : Operand<i32>,
+                  ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+  let EncoderMethod = "getLdStSORegOpValue";
+  // FIXME: Simplify the printer
+  let PrintMethod = "printAddrMode2Operand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
 // addrmode2 := reg +/- imm12
+//           := reg +/- reg shop imm
 //
 def addrmode2 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let EncoderMethod = "getAddrMode2OpValue";
   let PrintMethod = "printAddrMode2Operand";
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
 def am2offset : Operand<i32>,
-                ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> {
+                ComplexPattern<i32, 2, "SelectAddrMode2Offset",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
   let PrintMethod = "printAddrMode2OffsetOperand";
   let MIOperandInfo = (ops GPR, i32imm);
 }
@@ -389,22 +517,29 @@ def am2offset : Operand<i32>,
 //
 def addrmode3 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let EncoderMethod = "getAddrMode3OpValue";
   let PrintMethod = "printAddrMode3Operand";
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
 def am3offset : Operand<i32>,
-                ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> {
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset",
+                               [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode3OffsetOpValue";
   let PrintMethod = "printAddrMode3OffsetOperand";
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
-// addrmode4 := reg, <mode|W>
+// ldstm_mode := {ia, ib, da, db}
 //
-def addrmode4 : Operand<i32>,
-                ComplexPattern<i32, 2, "SelectAddrMode4", []> {
-  let PrintMethod = "printAddrMode4Operand";
-  let MIOperandInfo = (ops GPR:$addr, i32imm);
+def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
+  let EncoderMethod = "getLdStmModeOpValue";
+  let PrintMethod = "printLdStmModeOperand";
+}
+
+def MemMode5AsmOperand : AsmOperandClass {
+  let Name = "MemMode5";
+  let SuperClasses = [];
 }
 
 // addrmode5 := reg +/- imm8*4
@@ -413,19 +548,32 @@ def addrmode5 : Operand<i32>,
                 ComplexPattern<i32, 2, "SelectAddrMode5", []> {
   let PrintMethod = "printAddrMode5Operand";
   let MIOperandInfo = (ops GPR:$base, i32imm);
+  let ParserMatchClass = MemMode5AsmOperand;
+  let EncoderMethod = "getAddrMode5OpValue";
 }
 
-// addrmode6 := reg with optional writeback
+// addrmode6 := reg with optional alignment
 //
 def addrmode6 : Operand<i32>,
-                ComplexPattern<i32, 2, "SelectAddrMode6", []> {
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
   let PrintMethod = "printAddrMode6Operand";
   let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
 }
 
 def am6offset : Operand<i32> {
   let PrintMethod = "printAddrMode6OffsetOperand";
   let MIOperandInfo = (ops GPR);
+  let EncoderMethod = "getAddrMode6OffsetOpValue";
+}
+
+// Special version of addrmode6 to handle alignment encoding for VLD-dup
+// instructions, specifically VLD4-dup.
+def addrmode6dup : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
 }
 
 // addrmodepc := pc + reg
@@ -440,6 +588,28 @@ def nohash_imm : Operand<i32> {
   let PrintMethod = "printNoHashImmediate";
 }
 
+def CoprocNumAsmOperand : AsmOperandClass {
+  let Name = "CoprocNum";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseCoprocNumOperand";
+}
+
+def CoprocRegAsmOperand : AsmOperandClass {
+  let Name = "CoprocReg";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseCoprocRegOperand";
+}
+
+def p_imm : Operand<i32> {
+  let PrintMethod = "printPImmediate";
+  let ParserMatchClass = CoprocNumAsmOperand;
+}
+
+def c_imm : Operand<i32> {
+  let PrintMethod = "printCImmediate";
+  let ParserMatchClass = CoprocRegAsmOperand;
+}
+
 //===----------------------------------------------------------------------===//
 
 include "ARMInstrFormats.td"
@@ -450,55 +620,93 @@ include "ARMInstrFormats.td"
 
 /// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
 /// binop that produces a value.
-multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
-                        bit Commutable = 0> {
+multiclass AsI1_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
   // The register-immediate version is re-materializable. This is useful
   // in particular for taking the address of a local.
   let isReMaterializable = 1 in {
-  def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
-               IIC_iALUi, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
     let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
   }
   }
-  def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
-               IIC_iALUr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
-    let Inst{11-4} = 0b00000000;
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
     let Inst{25} = 0;
     let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
   }
-  def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
-               IIC_iALUsr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+  def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
     let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = shift;
   }
 }
 
 /// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
 /// instruction modifies the CPSR register.
-let Defs = [CPSR] in {
-multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
-                         bit Commutable = 0> {
-  def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
-               IIC_iALUi, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
-    let Inst{20} = 1;
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+multiclass AI1_bin_s_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> {
+  def ri : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
     let Inst{25} = 1;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
   }
-  def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
-               IIC_iALUr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
+  def rr : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
     let isCommutable = Commutable;
-    let Inst{11-4} = 0b00000000;
-    let Inst{20} = 1;
     let Inst{25} = 0;
-  }
-  def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
-               IIC_iALUsr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
     let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+  def rs : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
     let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = shift;
   }
 }
 }
@@ -507,146 +715,233 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 /// patterns. Similar to AsI1_bin_irs except the instruction does not produce
 /// a explicit result, only implicitly set CPSR.
 let isCompare = 1, Defs = [CPSR] in {
-multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
-                       bit Commutable = 0> {
-  def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi,
-               opc, "\t$a, $b",
-               [(opnode GPR:$a, so_imm:$b)]> {
-    let Inst{20} = 1;
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0> {
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii,
+               opc, "\t$Rn, $imm",
+               [(opnode GPR:$Rn, so_imm:$imm)]> {
+    bits<4> Rn;
+    bits<12> imm;
     let Inst{25} = 1;
-  }
-  def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr,
-               opc, "\t$a, $b",
-               [(opnode GPR:$a, GPR:$b)]> {
-    let Inst{11-4} = 0b00000000;
     let Inst{20} = 1;
-    let Inst{25} = 0;
-    let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = imm;
   }
-  def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr,
-               opc, "\t$a, $b",
-               [(opnode GPR:$a, so_reg:$b)]> {
+  def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+               opc, "\t$Rn, $Rm",
+               [(opnode GPR:$Rn, GPR:$Rm)]> {
+    bits<4> Rn;
+    bits<4> Rm;
+    let isCommutable = Commutable;
+    let Inst{25} = 0;
     let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+  def rs : AI1<opcod, (outs), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPR:$Rn, so_reg:$shift)]> {
+    bits<4> Rn;
+    bits<12> shift;
     let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = shift;
   }
 }
 }
 
-/// AI_unary_rrot - A unary operation with two forms: one whose operand is a
+/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
 /// FIXME: Remove the 'r' variant. Its rot_imm is zero.
-multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
-  def r     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src),
-                 IIC_iUNAr, opc, "\t$dst, $src",
-                 [(set GPR:$dst, (opnode GPR:$src))]>,
+multiclass AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+  def r     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
+                 IIC_iEXTr, opc, "\t$Rd, $Rm",
+                 [(set GPR:$Rd, (opnode GPR:$Rm))]>,
               Requires<[IsARM, HasV6]> {
-    let Inst{11-10} = 0b00;
+    bits<4> Rd;
+    bits<4> Rm;
     let Inst{19-16} = 0b1111;
+    let Inst{15-12} = Rd;
+    let Inst{11-10} = 0b00;
+    let Inst{3-0}   = Rm;
   }
-  def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot),
-                 IIC_iUNAsi, opc, "\t$dst, $src, ror $rot",
-                 [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
+  def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
+                 IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
+                 [(set GPR:$Rd, (opnode (rotr GPR:$Rm, rot_imm:$rot)))]>,
               Requires<[IsARM, HasV6]> {
+    bits<4> Rd;
+    bits<4> Rm;
+    bits<2> rot;
     let Inst{19-16} = 0b1111;
+    let Inst{15-12} = Rd;
+    let Inst{11-10} = rot;
+    let Inst{3-0}   = Rm;
   }
 }
 
-multiclass AI_unary_rrot_np<bits<8> opcod, string opc> {
-  def r     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src),
-                 IIC_iUNAr, opc, "\t$dst, $src",
+multiclass AI_ext_rrot_np<bits<8> opcod, string opc> {
+  def r     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
+                 IIC_iEXTr, opc, "\t$Rd, $Rm",
                  [/* For disassembly only; pattern left blank */]>,
               Requires<[IsARM, HasV6]> {
-    let Inst{11-10} = 0b00;
     let Inst{19-16} = 0b1111;
+    let Inst{11-10} = 0b00;
   }
-  def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot),
-                 IIC_iUNAsi, opc, "\t$dst, $src, ror $rot",
+  def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
+                 IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
                  [/* For disassembly only; pattern left blank */]>,
               Requires<[IsARM, HasV6]> {
+    bits<2> rot;
     let Inst{19-16} = 0b1111;
+    let Inst{11-10} = rot;
   }
 }
 
-/// AI_bin_rrot - A binary operation with two forms: one whose operand is a
+/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> {
-  def rr     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
-                  IIC_iALUr, opc, "\t$dst, $LHS, $RHS",
-                  [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
+multiclass AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+  def rr     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
+                  [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
                Requires<[IsARM, HasV6]> {
+    bits<4> Rd;
+    bits<4> Rm;
+    bits<4> Rn;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
     let Inst{11-10} = 0b00;
+    let Inst{9-4}   = 0b000111;
+    let Inst{3-0}   = Rm;
+  }
+  def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
+                                             rot_imm:$rot),
+                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
+                  [(set GPR:$Rd, (opnode GPR:$Rn,
+                                          (rotr GPR:$Rm, rot_imm:$rot)))]>,
+                  Requires<[IsARM, HasV6]> {
+    bits<4> Rd;
+    bits<4> Rm;
+    bits<4> Rn;
+    bits<2> rot;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-10} = rot;
+    let Inst{9-4}   = 0b000111;
+    let Inst{3-0}   = Rm;
   }
-  def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS,
-                                              i32imm:$rot),
-                  IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot",
-                  [(set GPR:$dst, (opnode GPR:$LHS,
-                                          (rotr GPR:$RHS, rot_imm:$rot)))]>,
-                  Requires<[IsARM, HasV6]>;
 }
 
 // For disassembly only.
-multiclass AI_bin_rrot_np<bits<8> opcod, string opc> {
-  def rr     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
-                  IIC_iALUr, opc, "\t$dst, $LHS, $RHS",
+multiclass AI_exta_rrot_np<bits<8> opcod, string opc> {
+  def rr     : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
                   [/* For disassembly only; pattern left blank */]>,
                Requires<[IsARM, HasV6]> {
     let Inst{11-10} = 0b00;
   }
-  def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS,
-                                              i32imm:$rot),
-                  IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot",
+  def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
+                                             rot_imm:$rot),
+                  IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
                   [/* For disassembly only; pattern left blank */]>,
-                  Requires<[IsARM, HasV6]>;
+                  Requires<[IsARM, HasV6]> {
+    bits<4> Rn;
+    bits<2> rot;
+    let Inst{19-16} = Rn;
+    let Inst{11-10} = rot;
+  }
 }
 
 /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
 let Uses = [CPSR] in {
 multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                              bit Commutable = 0> {
-  def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
-                DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
                Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
     let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
   }
-  def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
                Requires<[IsARM]> {
-    let isCommutable = Commutable;
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
     let Inst{11-4} = 0b00000000;
     let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
   }
-  def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
-                DPSoRegFrm, IIC_iALUsr, opc, "\t$dst, $a, $b",
-               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
+  def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                DPSoRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>,
                Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
     let Inst{25} = 0;
+    let Inst{11-0} = shift;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
   }
 }
 // Carry setting variants
-let Defs = [CPSR] in {
+let isCodeGenOnly = 1, Defs = [CPSR] in {
 multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
                              bit Commutable = 0> {
-  def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
-                DPFrm, IIC_iALUi, !strconcat(opc, "\t$dst, $a, $b"),
-               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
+  def Sri : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, !strconcat(opc, "\t$Rd, $Rn, $imm"),
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
                Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
     let Inst{20} = 1;
     let Inst{25} = 1;
   }
-  def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                DPFrm, IIC_iALUr, !strconcat(opc, "\t$dst, $a, $b"),
-               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
+  def Srr : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, !strconcat(opc, "\t$Rd, $Rn, $Rm"),
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
                Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
     let Inst{11-4} = 0b00000000;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
     let Inst{20} = 1;
     let Inst{25} = 0;
   }
-  def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
-                DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$dst, $a, $b"),
-               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
+  def Srs : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$Rd, $Rn, $shift"),
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>,
                Requires<[IsARM]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{11-0} = shift;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
     let Inst{20} = 1;
     let Inst{25} = 0;
   }
@@ -654,6 +949,62 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 }
 }
 
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                  [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
+                  AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                 [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPR:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
+                  AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                 [(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -669,8 +1020,7 @@ multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 let neverHasSideEffects = 1, isNotDuplicable = 1 in
 def CONSTPOOL_ENTRY :
 PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
-                    i32imm:$size), NoItinerary,
-           "${instid:label} ${cpidx:cpentry}", []>;
+                    i32imm:$size), NoItinerary, []>;
 
 // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
 // from removing one half of the matched pairs. That breaks PEI, which assumes
@@ -678,12 +1028,10 @@ PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
 let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
 def ADJCALLSTACKUP :
 PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
-           "${:comment} ADJCALLSTACKUP $amt1",
            [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
 
 def ADJCALLSTACKDOWN :
 PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
-           "${:comment} ADJCALLSTACKDOWN $amt",
            [(ARMcallseq_start timm:$amt)]>;
 }
 
@@ -691,6 +1039,7 @@ def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
+  let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000000;
 }
 
@@ -698,6 +1047,7 @@ def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
+  let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000001;
 }
 
@@ -705,6 +1055,7 @@ def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
+  let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000010;
 }
 
@@ -712,6 +1063,7 @@ def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
+  let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000011;
 }
 
@@ -719,14 +1071,22 @@ def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel",
              "\t$dst, $a, $b",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
   let Inst{27-20} = 0b01101000;
   let Inst{7-4} = 0b1011;
+  let Inst{11-8} = 0b1111;
 }
 
 def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV6T2]> {
   let Inst{27-16} = 0b001100100000;
+  let Inst{15-8} = 0b11110000;
   let Inst{7-0} = 0b00000100;
 }
 
@@ -735,154 +1095,174 @@ def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "",
 def BKPT : AI<(outs), (ins i32imm:$val), MiscFrm, NoItinerary, "bkpt", "\t$val",
               [/* For disassembly only; pattern left blank */]>,
            Requires<[IsARM]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
   let Inst{27-20} = 0b00010010;
   let Inst{7-4} = 0b0111;
 }
 
-// Change Processor State is a system instruction -- for disassembly only.
-// The singleton $opt operand contains the following information:
-// opt{4-0} = mode from Inst{4-0}
-// opt{5} = changemode from Inst{17}
-// opt{8-6} = AIF from Inst{8-6}
-// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable
-def CPS : AXI<(outs), (ins cps_opt:$opt), MiscFrm, NoItinerary, "cps$opt",
-              [/* For disassembly only; pattern left blank */]>,
-          Requires<[IsARM]> {
+// Change Processor State is a system instruction -- for disassembly and
+// parsing only.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class CPS<dag iops, string asm_ops>
+  : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+        [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
   let Inst{31-28} = 0b1111;
   let Inst{27-20} = 0b00010000;
-  let Inst{16} = 0;
-  let Inst{5} = 0;
+  let Inst{19-18} = imod;
+  let Inst{17}    = M; // Enabled if mode is set;
+  let Inst{16}    = 0;
+  let Inst{8-6}   = iflags;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = mode;
 }
 
+let M = 1 in
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+                  "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in
+  def CPS1p : CPS<(ins i32imm:$mode), "\t$mode">;
+
 // Preload signals the memory system of possible future data/instruction access.
 // These are for disassembly only.
-//
-// A8.6.117, A8.6.118.  Different instructions are generated for #0 and #-0.
-// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc.
-multiclass APreLoad<bit data, bit read, string opc> {
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
 
-  def i : AXI<(outs), (ins GPR:$base, neg_zero:$imm), MiscFrm, NoItinerary,
-               !strconcat(opc, "\t[$base, $imm]"), []> {
+  def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
+                !strconcat(opc, "\t$addr"),
+                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> {
+    bits<4> Rt;
+    bits<17> addr;
     let Inst{31-26} = 0b111101;
     let Inst{25} = 0; // 0 for immediate form
     let Inst{24} = data;
+    let Inst{23} = addr{12};        // U (add = ('U' == 1))
     let Inst{22} = read;
     let Inst{21-20} = 0b01;
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = addr{11-0};   // imm12
   }
 
-  def r : AXI<(outs), (ins addrmode2:$addr), MiscFrm, NoItinerary,
-               !strconcat(opc, "\t$addr"), []> {
+  def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
+               !strconcat(opc, "\t$shift"),
+               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> {
+    bits<17> shift;
     let Inst{31-26} = 0b111101;
     let Inst{25} = 1; // 1 for register form
     let Inst{24} = data;
+    let Inst{23} = shift{12};    // U (add = ('U' == 1))
     let Inst{22} = read;
     let Inst{21-20} = 0b01;
-    let Inst{4} = 0;
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = shift{11-0};
   }
 }
 
-defm PLD  : APreLoad<1, 1, "pld">;
-defm PLDW : APreLoad<1, 0, "pldw">;
-defm PLI  : APreLoad<0, 1, "pli">;
-
-def SETENDBE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tbe",
-                   [/* For disassembly only; pattern left blank */]>,
-               Requires<[IsARM]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{27-20} = 0b00010000;
-  let Inst{16} = 1;
-  let Inst{9} = 1;
-  let Inst{7-4} = 0b0000;
-}
+defm PLD  : APreLoad<1, 1, "pld">,  Requires<[IsARM]>;
+defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
+defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
 
-def SETENDLE : AXI<(outs),(ins), MiscFrm, NoItinerary, "setend\tle",
-                   [/* For disassembly only; pattern left blank */]>,
+def SETEND : AXI<(outs),(ins setend_op:$end), MiscFrm, NoItinerary,
+                 "setend\t$end",
+                 [/* For disassembly only; pattern left blank */]>,
                Requires<[IsARM]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{27-20} = 0b00010000;
-  let Inst{16} = 1;
-  let Inst{9} = 0;
-  let Inst{7-4} = 0b0000;
+  bits<1> end;
+  let Inst{31-10} = 0b1111000100000001000000;
+  let Inst{9} = end;
+  let Inst{8-0} = 0;
 }
 
 def DBG : AI<(outs), (ins i32imm:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
              [/* For disassembly only; pattern left blank */]>,
           Requires<[IsARM, HasV7]> {
-  let Inst{27-16} = 0b001100100000;
-  let Inst{7-4} = 0b1111;
+  bits<4> opt;
+  let Inst{27-4} = 0b001100100000111100001111;
+  let Inst{3-0} = opt;
 }
 
 // A5.4 Permanently UNDEFINED instructions.
-// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to
-// binutils
 let isBarrier = 1, isTerminator = 1 in
-def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, 
-               ".long 0xe7ffdefe ${:comment} trap", [(trap)]>,
+def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
            Requires<[IsARM]> {
-  let Inst{27-25} = 0b011;
-  let Inst{24-20} = 0b11111;
-  let Inst{7-5} = 0b111;
-  let Inst{4} = 0b1;
+  let Inst = 0xe7ffdefe;
 }
 
 // Address computation and loads and stores in PIC mode.
 let isNotDuplicable = 1 in {
-def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
-                  Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p\t$dst, pc, $a",
-                   [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+                            Size4Bytes, IIC_iALUr,
+                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
 
 let AddedComplexity = 10 in {
-def PICLDR  : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
-                  Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p\t$dst, $addr",
-                  [(set GPR:$dst, (load addrmodepc:$addr))]>;
+def PICLDR  : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                            Size4Bytes, IIC_iLoad_r,
+                            [(set GPR:$dst, (load addrmodepc:$addr))]>;
 
-def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
-                Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrh${p}\t$dst, $addr",
-                  [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
+def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            Size4Bytes, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
 
-def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
-                Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrb${p}\t$dst, $addr",
-                  [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
+def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            Size4Bytes, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
 
-def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
-               Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsh${p}\t$dst, $addr",
-                  [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
+def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            Size4Bytes, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
 
-def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
-               Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsb${p}\t$dst, $addr",
-                  [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
+def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            Size4Bytes, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
 }
 let AddedComplexity = 10 in {
-def PICSTR  : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
-               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p\t$src, $addr",
-               [(store GPR:$src, addrmodepc:$addr)]>;
+def PICSTR  : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      Size4Bytes, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
 
-def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
-               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrh${p}\t$src, $addr",
-               [(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
+def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      Size4Bytes, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
+                                                   addrmodepc:$addr)]>;
 
-def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
-               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrb${p}\t$src, $addr",
-               [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      Size4Bytes, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
 }
 } // isNotDuplicable = 1
 
 
 // LEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
-let neverHasSideEffects = 1 in {
-let isReMaterializable = 1 in
-def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p),
-                    Pseudo, IIC_iALUi,
-                    "adr$p\t$dst, #$label", []>;
-
-} // neverHasSideEffects
-def LEApcrelJT : AXI1<0x0, (outs GPR:$dst),
-                           (ins i32imm:$label, nohash_imm:$id, pred:$p),
-                      Pseudo, IIC_iALUi,
-                      "adr$p\t$dst, #${label}_${id}", []> {
-    let Inst{25} = 1;
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+// The 'adr' mnemonic encodes differently if the label is before or after
+// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
+// know until then which form of the instruction will be used.
+def ADR : AI1<0, (outs GPR:$Rd), (ins adrlabel:$label),
+                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> {
+  bits<4> Rd;
+  bits<12> label;
+  let Inst{27-25} = 0b001;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = label;
 }
+def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
+                    Size4Bytes, IIC_iALUi, []>;
+
+def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
+                      (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                      Size4Bytes, IIC_iALUi, []>;
 
 //===----------------------------------------------------------------------===//
 //  Control Flow Instructions.
@@ -893,159 +1273,139 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
   def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
                   "bx", "\tlr", [(ARMretflag)]>,
                Requires<[IsARM, HasV4T]> {
-    let Inst{3-0}   = 0b1110;
-    let Inst{7-4}   = 0b0001;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
+    let Inst{27-0}  = 0b0001001011111111111100011110;
   }
 
   // ARMV4 only
-  def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, 
+  def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
                   "mov", "\tpc, lr", [(ARMretflag)]>,
                Requires<[IsARM, NoV4T]> {
-    let Inst{11-0}  = 0b000000001110;
-    let Inst{15-12} = 0b1111;
-    let Inst{19-16} = 0b0000;
-    let Inst{27-20} = 0b00011010;
+    let Inst{27-0} = 0b0001101000001111000000001110;
   }
 }
 
 // Indirect branches
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
   // ARMV4T and above
-  def BRIND : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+  def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
                   [(brind GPR:$dst)]>,
               Requires<[IsARM, HasV4T]> {
-    let Inst{7-4}   = 0b0001;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
-    let Inst{31-28} = 0b1110;
+    bits<4> dst;
+    let Inst{31-4} = 0b1110000100101111111111110001;
+    let Inst{3-0}  = dst;
   }
 
   // ARMV4 only
-  def MOVPCRX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "mov\tpc, $dst",
-                  [(brind GPR:$dst)]>,
-              Requires<[IsARM, NoV4T]> {
-    let Inst{11-4}  = 0b00000000;
-    let Inst{15-12} = 0b1111;
-    let Inst{19-16} = 0b0000;
-    let Inst{27-20} = 0b00011010;
-    let Inst{31-28} = 0b1110;
-  }
+  // FIXME: We would really like to define this as a vanilla ARMPat like:
+  // ARMPat<(brind GPR:$dst), (MOVr PC, GPR:$dst)>
+  // With that, however, we can't set isBranch, isTerminator, etc..
+  def MOVPCRX : ARMPseudoInst<(outs), (ins GPR:$dst),
+                    Size4Bytes, IIC_Br, [(brind GPR:$dst)]>,
+                    Requires<[IsARM, NoV4T]>;
 }
 
-// FIXME: remove when we have a way to marking a MI with these properties.
-// FIXME: Should pc be an implicit operand like PICADD, etc?
-let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
-    hasExtraDefRegAllocReq = 1 in
-  def LDM_RET : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                        reglist:$dsts, variable_ops),
-                       IndexModeUpd, LdStMulFrm, IIC_Br,
-                       "ldm${addr:submode}${p}\t$addr!, $dsts",
-                       "$addr.addr = $wb", []>;
-
-// On non-Darwin platforms R9 is callee-saved.
+// All calls clobber the non-callee saved registers. SP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
 let isCall = 1,
+  // On non-Darwin platforms R9 is callee-saved.
   Defs = [R0,  R1,  R2,  R3,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
-  def BL  : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
-                IIC_Br, "bl\t${func:call}",
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Uses = [SP] in {
+  def BL  : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops),
+                IIC_Br, "bl\t$func",
                 [(ARMcall tglobaladdr:$func)]>,
             Requires<[IsARM, IsNotDarwin]> {
     let Inst{31-28} = 0b1110;
+    bits<24> func;
+    let Inst{23-0} = func;
   }
 
-  def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
-                   IIC_Br, "bl", "\t${func:call}",
+  def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops),
+                   IIC_Br, "bl", "\t$func",
                    [(ARMcall_pred tglobaladdr:$func)]>,
-                Requires<[IsARM, IsNotDarwin]>;
+                Requires<[IsARM, IsNotDarwin]> {
+    bits<24> func;
+    let Inst{23-0} = func;
+  }
 
   // ARMv5T and above
   def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
                 IIC_Br, "blx\t$func",
                 [(ARMcall GPR:$func)]>,
             Requires<[IsARM, HasV5T, IsNotDarwin]> {
-    let Inst{7-4}   = 0b0011;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
+    bits<4> func;
+    let Inst{31-4} = 0b1110000100101111111111110011;
+    let Inst{3-0}   = func;
   }
 
   // ARMv4T
   // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
-  def BX : ABXIx2<(outs), (ins tGPR:$func, variable_ops),
-                  IIC_Br, "mov\tlr, pc\n\tbx\t$func",
-                  [(ARMcall_nolink tGPR:$func)]>,
-           Requires<[IsARM, HasV4T, IsNotDarwin]> {
-    let Inst{7-4}   = 0b0001;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
-  }
+  def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                   Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, HasV4T, IsNotDarwin]>;
 
   // ARMv4
-  def BMOVPCRX : ABXIx2<(outs), (ins tGPR:$func, variable_ops),
-                 IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func",
-                 [(ARMcall_nolink tGPR:$func)]>,
-           Requires<[IsARM, NoV4T, IsNotDarwin]> {
-    let Inst{11-4}  = 0b00000000;
-    let Inst{15-12} = 0b1111;
-    let Inst{19-16} = 0b0000;
-    let Inst{27-20} = 0b00011010;
-  }
+  def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                   Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, NoV4T, IsNotDarwin]>;
 }
 
-// On Darwin R9 is call-clobbered.
 let isCall = 1,
+  // On Darwin R9 is call-clobbered.
+  // R7 is marked as a use to prevent frame-pointer assignments from being
+  // moved above / below calls.
   Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
-  def BLr9  : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
-                IIC_Br, "bl\t${func:call}",
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Uses = [R7, SP] in {
+  def BLr9  : ABXI<0b1011, (outs), (ins bltarget:$func, variable_ops),
+                IIC_Br, "bl\t$func",
                 [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> {
     let Inst{31-28} = 0b1110;
+    bits<24> func;
+    let Inst{23-0} = func;
   }
 
-  def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
-                   IIC_Br, "bl", "\t${func:call}",
+  def BLr9_pred : ABI<0b1011, (outs), (ins bltarget:$func, variable_ops),
+                   IIC_Br, "bl", "\t$func",
                    [(ARMcall_pred tglobaladdr:$func)]>,
-                  Requires<[IsARM, IsDarwin]>;
+                  Requires<[IsARM, IsDarwin]> {
+    bits<24> func;
+    let Inst{23-0} = func;
+  }
 
   // ARMv5T and above
   def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
                 IIC_Br, "blx\t$func",
                 [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
-    let Inst{7-4}   = 0b0011;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
+    bits<4> func;
+    let Inst{31-4} = 0b1110000100101111111111110011;
+    let Inst{3-0}   = func;
   }
 
   // ARMv4T
   // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
-  def BXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops),
-                  IIC_Br, "mov\tlr, pc\n\tbx\t$func",
-                  [(ARMcall_nolink tGPR:$func)]>,
-             Requires<[IsARM, HasV4T, IsDarwin]> {
-    let Inst{7-4}   = 0b0001;
-    let Inst{19-8}  = 0b111111111111;
-    let Inst{27-20} = 0b00010010;
-  }
+  def BXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                  Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                  Requires<[IsARM, HasV4T, IsDarwin]>;
 
   // ARMv4
-  def BMOVPCRXr9 : ABXIx2<(outs), (ins tGPR:$func, variable_ops),
-                 IIC_Br, "mov\tlr, pc\n\tmov\tpc, $func",
-                 [(ARMcall_nolink tGPR:$func)]>,
-           Requires<[IsARM, NoV4T, IsDarwin]> {
-    let Inst{11-4}  = 0b00000000;
-    let Inst{15-12} = 0b1111;
-    let Inst{19-16} = 0b0000;
-    let Inst{27-20} = 0b00011010;
-  }
+  def BMOVPCRXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                  Size8Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                  Requires<[IsARM, NoV4T, IsDarwin]>;
 }
 
 // Tail calls.
 
+// FIXME: These should probably be xformed into the non-TC versions of the
+// instructions as part of MC lowering.
+// FIXME: These seem to be used for both Thumb and ARM instruction selection.
+// Thumb should have its own version since the instruction is actually
+// different, even though the mnemonic is the same.
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Darwin versions.
   let Defs = [R0, R1, R2, R3, R9, R12,
@@ -1053,29 +1413,26 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
               D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
               D27, D28, D29, D30, D31, PC],
       Uses = [SP] in {
-    def TCRETURNdi : AInoP<(outs), (ins i32imm:$dst, variable_ops),
-                       Pseudo, IIC_Br,
-                       "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>;
+    def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
+                       IIC_Br, []>, Requires<[IsDarwin]>;
 
-    def TCRETURNri : AInoP<(outs), (ins tcGPR:$dst, variable_ops),
-                       Pseudo, IIC_Br,
-                       "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>;
+    def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
+                       IIC_Br, []>, Requires<[IsDarwin]>;
 
     def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops),
                    IIC_Br, "b\t$dst  @ TAILCALL",
-                   []>, Requires<[IsDarwin]>;
+                   []>, Requires<[IsARM, IsDarwin]>;
 
     def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops),
                    IIC_Br, "b.w\t$dst  @ TAILCALL",
-                   []>, Requires<[IsDarwin]>;
+                   []>, Requires<[IsThumb, IsDarwin]>;
 
     def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops),
                      BrMiscFrm, IIC_Br, "bx\t$dst  @ TAILCALL",
                    []>, Requires<[IsDarwin]> {
-                   let Inst{7-4}   = 0b0001;
-                   let Inst{19-8}  = 0b111111111111;
-                   let Inst{27-20} = 0b00010010;
-                   let Inst{31-28} = 0b1110;
+      bits<4> dst;
+      let Inst{31-4} = 0b1110000100101111111111110001;
+      let Inst{3-0}  = dst;
     }
   }
 
@@ -1085,13 +1442,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
               D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
               D27, D28, D29, D30, D31, PC],
       Uses = [SP] in {
-    def TCRETURNdiND : AInoP<(outs), (ins i32imm:$dst, variable_ops),
-                       Pseudo, IIC_Br,
-                       "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>;
+    def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
+                       IIC_Br, []>, Requires<[IsNotDarwin]>;
 
-    def TCRETURNriND : AInoP<(outs), (ins tcGPR:$dst, variable_ops),
-                       Pseudo, IIC_Br,
-                       "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>;
+    def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
+                       IIC_Br, []>, Requires<[IsNotDarwin]>;
 
     def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops),
                    IIC_Br, "b\t$dst  @ TAILCALL",
@@ -1104,10 +1459,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
     def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops),
                      BrMiscFrm, IIC_Br, "bx\t$dst  @ TAILCALL",
                    []>, Requires<[IsNotDarwin]> {
-                   let Inst{7-4}   = 0b0001;
-                   let Inst{19-8}  = 0b111111111111;
-                   let Inst{27-20} = 0b00010010;
-                   let Inst{31-28} = 0b1110;
+      bits<4> dst;
+      let Inst{31-4} = 0b1110000100101111111111110001;
+      let Inst{3-0}  = dst;
     }
   }
 }
@@ -1117,48 +1471,40 @@ let isBranch = 1, isTerminator = 1 in {
   let isBarrier = 1 in {
     let isPredicable = 1 in
     def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br,
-                "b\t$target", [(br bb:$target)]>;
+                "b\t$target", [(br bb:$target)]> {
+      bits<24> target;
+      let Inst{31-28} = 0b1110;
+      let Inst{23-0} = target;
+    }
 
-  let isNotDuplicable = 1, isIndirectBranch = 1 in {
-  def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
-                    IIC_Br, "mov\tpc, $target$jt",
-                    [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
-    let Inst{11-4}  = 0b00000000;
-    let Inst{15-12} = 0b1111;
-    let Inst{20}    = 0; // S Bit
-    let Inst{24-21} = 0b1101;
-    let Inst{27-25} = 0b000;
-  }
-  def BR_JTm : JTI<(outs),
-                   (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id),
-                   IIC_Br, "ldr\tpc, $target$jt",
-                   [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
-                     imm:$id)]> {
-    let Inst{15-12} = 0b1111;
-    let Inst{20}    = 1; // L bit
-    let Inst{21}    = 0; // W bit
-    let Inst{22}    = 0; // B bit
-    let Inst{24}    = 1; // P bit
-    let Inst{27-25} = 0b011;
-  }
-  def BR_JTadd : JTI<(outs),
-                   (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
-                    IIC_Br, "add\tpc, $target, $idx$jt",
-                    [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
-                      imm:$id)]> {
-    let Inst{15-12} = 0b1111;
-    let Inst{20}    = 0; // S bit
-    let Inst{24-21} = 0b0100;
-    let Inst{27-25} = 0b000;
-  }
-  } // isNotDuplicable = 1, isIndirectBranch = 1
+    let isNotDuplicable = 1, isIndirectBranch = 1 in {
+    def BR_JTr : ARMPseudoInst<(outs),
+                      (ins GPR:$target, i32imm:$jt, i32imm:$id),
+                      SizeSpecial, IIC_Br,
+                      [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+    // FIXME: This shouldn't use the generic "addrmode2," but rather be split
+    // into i12 and rs suffixed versions.
+    def BR_JTm : ARMPseudoInst<(outs),
+                     (ins addrmode2:$target, i32imm:$jt, i32imm:$id),
+                     SizeSpecial, IIC_Br,
+                     [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+                       imm:$id)]>;
+    def BR_JTadd : ARMPseudoInst<(outs),
+                   (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id),
+                   SizeSpecial, IIC_Br,
+                   [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
+                     imm:$id)]>;
+    } // isNotDuplicable = 1, isIndirectBranch = 1
   } // isBarrier = 1
 
   // FIXME: should be able to write a pattern for ARMBrcond, but can't use
   // a two-value operand where a dag node expects two operands. :(
-  def Bcc : ABI<0b1010, (outs), (ins brtarget:$target),
+  def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
                IIC_Br, "b", "\t$target",
-               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>;
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
+    bits<24> target;
+    let Inst{23-0} = target;
+  }
 }
 
 // Branch and Exchange Jazelle -- for disassembly only
@@ -1172,271 +1518,303 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
 // Secure Monitor Call is a system instruction -- for disassembly only
 def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0110;
-  let Inst{7-4} = 0b0111;
+  bits<4> opt;
+  let Inst{23-4} = 0b01100000000000000111;
+  let Inst{3-0} = opt;
 }
 
 // Supervisor Call (Software Interrupt) -- for disassembly only
-let isCall = 1 in {
+let isCall = 1, Uses = [SP] in {
 def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc",
-              [/* For disassembly only; pattern left blank */]>;
+              [/* For disassembly only; pattern left blank */]> {
+  bits<24> svc;
+  let Inst{23-0} = svc;
+}
 }
 
 // Store Return State is a system instruction -- for disassembly only
-def SRSW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode),
-                NoItinerary, "srs${addr:submode}\tsp!, $mode",
+let isCodeGenOnly = 1 in {  // FIXME: This should not use submode!
+def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
+                NoItinerary, "srs${amode}\tsp!, $mode",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
   let Inst{22-20} = 0b110; // W = 1
 }
 
-def SRS  : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, i32imm:$mode),
-                NoItinerary, "srs${addr:submode}\tsp, $mode",
+def SRS  : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
+                NoItinerary, "srs${amode}\tsp, $mode",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
   let Inst{22-20} = 0b100; // W = 0
 }
 
 // Return From Exception is a system instruction -- for disassembly only
-def RFEW : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base),
-                NoItinerary, "rfe${addr:submode}\t$base!",
+def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
+                NoItinerary, "rfe${amode}\t$base!",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
   let Inst{22-20} = 0b011; // W = 1
 }
 
-def RFE  : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base),
-                NoItinerary, "rfe${addr:submode}\t$base",
+def RFE  : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
+                NoItinerary, "rfe${amode}\t$base",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
   let Inst{22-20} = 0b001; // W = 0
 }
+} // isCodeGenOnly = 1
 
 //===----------------------------------------------------------------------===//
 //  Load / store Instructions.
 //
 
 // Load
-let canFoldAsLoad = 1, isReMaterializable = 1 in
-def LDR  : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
-               "ldr", "\t$dst, $addr",
-               [(set GPR:$dst, (load addrmode2:$addr))]>;
+
+
+defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
+                    UnOpFrag<(load node:$Src)>>;
+defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+                    UnOpFrag<(zextloadi8 node:$Src)>>;
+defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm STRB : AI_str1<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+                   BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
 
 // Special LDR for loads from non-pc-relative constpools.
 let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
     isReMaterializable = 1 in
-def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
-                 "ldr", "\t$dst, $addr", []>;
+def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                 AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
+                 []> {
+  bits<4> Rt;
+  bits<17> addr;
+  let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rt;
+  let Inst{11-0}  = addr{11-0};   // imm12
+}
 
 // Loads with zero extension
-def LDRH  : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
-                  IIC_iLoadr, "ldrh", "\t$dst, $addr",
-                  [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
-
-def LDRB  : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm,
-                  IIC_iLoadr, "ldrb", "\t$dst, $addr",
-                  [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+def LDRH  : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                  IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
+                  [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
 
 // Loads with sign extension
-def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
-                   IIC_iLoadr, "ldrsh", "\t$dst, $addr",
-                   [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
-
-def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
-                   IIC_iLoadr, "ldrsb", "\t$dst, $addr",
-                   [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
-
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+    isCodeGenOnly = 1 in { // $dst2 doesn't exist in asmstring?
+// FIXME: $dst2 isn't in the asm string as it's implied by $Rd (dst2 = Rd+1)
+//        how to represent that such that tblgen is happy and we don't
+//        mark this codegen only?
 // Load doubleword
-def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm,
-                 IIC_iLoadr, "ldrd", "\t$dst1, $addr",
+def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
+                 (ins addrmode3:$addr), LdMiscFrm,
+                 IIC_iLoad_d_r, "ldrd", "\t$Rd, $addr",
                  []>, Requires<[IsARM, HasV5TE]>;
+}
 
 // Indexed loads
-def LDR_PRE  : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb),
-                     (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
-                     "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
-
-def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
-                     (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru,
-                     "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
-
-def LDRH_PRE  : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb),
-                     (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
-                     "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
-
-def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
-                     (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
-                    "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
-
-def LDRB_PRE  : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb),
-                     (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
-                     "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
-
-def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
-                     (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru,
-                    "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
-
-def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb),
-                      (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
-                      "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
-
-def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
-                      (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
-                   "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
-
-def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb),
-                      (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
-                      "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
-
-def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
-                      (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
-                   "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> {
+  def _PRE  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addrmode2:$addr), IndexModePre, LdFrm, itin,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    // {17-14}  Rn
+    // {13}     1 == Rm, 0 == imm12
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<18> addr;
+    let Inst{25} = addr{13};
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{17-14};
+    let Inst{11-0} = addr{11-0};
+  }
+  def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins GPR:$Rn, am2offset:$offset),
+                      IndexModePost, LdFrm, itin,
+                      opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
+    // {13}     1 == Rm, 0 == imm12
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> Rn;
+    let Inst{25} = offset{13};
+    let Inst{23} = offset{12};
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = offset{11-0};
+  }
+}
 
-// For disassembly only
-def LDRD_PRE : AI3lddpr<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb),
-                        (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadr,
-                 "ldrd", "\t$dst1, $dst2, $addr!", "$addr.base = $base_wb", []>,
-                Requires<[IsARM, HasV5TE]>;
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_ru>;
+}
 
-// For disassembly only
-def LDRD_POST : AI3lddpo<(outs GPR:$dst1, GPR:$dst2, GPR:$base_wb),
-                       (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadr,
-            "ldrd", "\t$dst1, $dst2, [$base], $offset", "$base = $base_wb", []>,
-                Requires<[IsARM, HasV5TE]>;
+multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> {
+  def _PRE  : AI3ldstidx<op, op20, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addrmode3:$addr), IndexModePre,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<14> addr;
+    let Inst{23}    = addr{8};      // U bit
+    let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr{12-9};   // Rn
+    let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+    let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  }
+  def _POST : AI3ldstidx<op, op20, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins GPR:$Rn, am3offset:$offset), IndexModePost,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
+    bits<10> offset;
+    bits<4> Rn;
+    let Inst{23}    = offset{8};      // U bit
+    let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+    let Inst{19-16} = Rn;
+    let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+    let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  }
+}
 
-} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LDRH  : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+defm LDRD :  AI3_ldridx<0b1101, 0, "ldrd", IIC_iLoad_d_ru>;
+} // mayLoad = 1, neverHasSideEffects = 1
 
 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only.
-
-def LDRT : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
-                   (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru,
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$dst, GPR:$base_wb),
+                   (ins GPR:$base, am2offset:$offset), IndexModeNone,
+                   LdFrm, IIC_iLoad_ru,
                    "ldrt", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
   let Inst{21} = 1; // overwrite
 }
-
-def LDRBT : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
-                  (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru,
+def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$dst, GPR:$base_wb),
+                  (ins GPR:$base, am2offset:$offset), IndexModeNone,
+                  LdFrm, IIC_iLoad_bh_ru,
                   "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
   let Inst{21} = 1; // overwrite
 }
-
-def LDRSBT : AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
-                 (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
+def LDRSBT : AI3ldstidx<0b1101, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb),
+                 (ins GPR:$base, am3offset:$offset), IndexModePost,
+                 LdMiscFrm, IIC_iLoad_bh_ru,
                  "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
   let Inst{21} = 1; // overwrite
 }
-
-def LDRHT : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
-                  (ins GPR:$base, am3offset:$offset), LdMiscFrm, IIC_iLoadru,
-                  "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
+def LDRHT : AI3ldstidx<0b1011, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb),
+                 (ins GPR:$base, am3offset:$offset), IndexModePost,
+                 LdMiscFrm, IIC_iLoad_bh_ru,
+                 "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
   let Inst{21} = 1; // overwrite
 }
-
-def LDRSHT : AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
-                 (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
+def LDRSHT : AI3ldstidx<0b1111, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb),
+                 (ins GPR:$base, am3offset:$offset), IndexModePost,
+                 LdMiscFrm, IIC_iLoad_bh_ru,
                  "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> {
   let Inst{21} = 1; // overwrite
 }
+}
 
 // Store
-def STR  : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
-               "str", "\t$src, $addr",
-               [(store GPR:$src, addrmode2:$addr)]>;
 
 // Stores with truncate
-def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm,
-               IIC_iStorer, "strh", "\t$src, $addr",
-               [(truncstorei16 GPR:$src, addrmode3:$addr)]>;
-
-def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
-               "strb", "\t$src, $addr",
-               [(truncstorei8 GPR:$src, addrmode2:$addr)]>;
+def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
+               IIC_iStore_bh_r, "strh", "\t$Rt, $addr",
+               [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
 
 // Store doubleword
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
-def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),
-               StMiscFrm, IIC_iStorer,
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1,
+    isCodeGenOnly = 1 in  // $src2 doesn't exist in asm string
+def STRD : AI3str<0b1111, (outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),
+               StMiscFrm, IIC_iStore_d_r,
                "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>;
 
 // Indexed stores
-def STR_PRE  : AI2stwpr<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base, am2offset:$offset),
-                     StFrm, IIC_iStoreru,
-                    "str", "\t$src, [$base, $offset]!", "$base = $base_wb",
-                    [(set GPR:$base_wb,
-                      (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
-
-def STR_POST : AI2stwpo<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am2offset:$offset),
-                     StFrm, IIC_iStoreru,
-                    "str", "\t$src, [$base], $offset", "$base = $base_wb",
-                    [(set GPR:$base_wb,
-                      (post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
-
-def STRH_PRE : AI3sthpr<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am3offset:$offset),
-                     StMiscFrm, IIC_iStoreru,
-                     "strh", "\t$src, [$base, $offset]!", "$base = $base_wb",
-                    [(set GPR:$base_wb,
-                      (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
-
-def STRH_POST: AI3sthpo<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am3offset:$offset),
-                     StMiscFrm, IIC_iStoreru,
-                     "strh", "\t$src, [$base], $offset", "$base = $base_wb",
-                    [(set GPR:$base_wb, (post_truncsti16 GPR:$src,
-                                         GPR:$base, am3offset:$offset))]>;
-
-def STRB_PRE : AI2stbpr<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am2offset:$offset),
-                     StFrm, IIC_iStoreru,
-                     "strb", "\t$src, [$base, $offset]!", "$base = $base_wb",
-                    [(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
-                                         GPR:$base, am2offset:$offset))]>;
-
-def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am2offset:$offset),
-                     StFrm, IIC_iStoreru,
-                     "strb", "\t$src, [$base], $offset", "$base = $base_wb",
-                    [(set GPR:$base_wb, (post_truncsti8 GPR:$src,
-                                         GPR:$base, am2offset:$offset))]>;
+def STR_PRE  : AI2stridx<0, 1, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
+                     IndexModePre, StFrm, IIC_iStore_ru,
+                     "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb,
+                      (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
+
+def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
+                     IndexModePost, StFrm, IIC_iStore_ru,
+                     "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb,
+                      (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
+
+def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
+                     IndexModePre, StFrm, IIC_iStore_bh_ru,
+                     "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt,
+                                        GPR:$Rn, am2offset:$offset))]>;
+def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
+                     IndexModePost, StFrm, IIC_iStore_bh_ru,
+                     "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt,
+                                        GPR:$Rn, am2offset:$offset))]>;
+
+def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
+                     IndexModePre, StMiscFrm, IIC_iStore_ru,
+                     "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb,
+                      (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+
+def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
+                     IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                     "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+                                        GPR:$Rn, am3offset:$offset))]>;
 
 // For disassembly only
 def STRD_PRE : AI3stdpr<(outs GPR:$base_wb),
                      (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
-                     StMiscFrm, IIC_iStoreru,
+                     StMiscFrm, IIC_iStore_d_ru,
                      "strd", "\t$src1, $src2, [$base, $offset]!",
                      "$base = $base_wb", []>;
 
 // For disassembly only
 def STRD_POST: AI3stdpo<(outs GPR:$base_wb),
                      (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
-                     StMiscFrm, IIC_iStoreru,
+                     StMiscFrm, IIC_iStore_d_ru,
                      "strd", "\t$src1, $src2, [$base], $offset",
                      "$base = $base_wb", []>;
 
 // STRT, STRBT, and STRHT are for disassembly only.
 
-def STRT : AI2stwpo<(outs GPR:$base_wb),
-                    (ins GPR:$src, GPR:$base,am2offset:$offset),
-                    StFrm, IIC_iStoreru,
-                    "strt", "\t$src, [$base], $offset", "$base = $base_wb",
+def STRT : AI2stridx<0, 0, (outs GPR:$Rn_wb),
+                    (ins GPR:$Rt, GPR:$Rn,am2offset:$offset),
+                    IndexModeNone, StFrm, IIC_iStore_ru,
+                    "strt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
                     [/* For disassembly only; pattern left blank */]> {
   let Inst{21} = 1; // overwrite
 }
 
-def STRBT : AI2stbpo<(outs GPR:$base_wb),
-                     (ins GPR:$src, GPR:$base,am2offset:$offset),
-                     StFrm, IIC_iStoreru,
-                     "strbt", "\t$src, [$base], $offset", "$base = $base_wb",
+def STRBT : AI2stridx<1, 0, (outs GPR:$Rn_wb),
+                     (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
+                     IndexModeNone, StFrm, IIC_iStore_bh_ru,
+                     "strbt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
                      [/* For disassembly only; pattern left blank */]> {
   let Inst{21} = 1; // overwrite
 }
 
 def STRHT: AI3sthpo<(outs GPR:$base_wb),
                     (ins GPR:$src, GPR:$base,am3offset:$offset),
-                    StMiscFrm, IIC_iStoreru,
+                    StMiscFrm, IIC_iStore_bh_ru,
                     "strht", "\t$src, [$base], $offset", "$base = $base_wb",
                     [/* For disassembly only; pattern left blank */]> {
   let Inst{21} = 1; // overwrite
@@ -1446,103 +1824,212 @@ def STRHT: AI3sthpo<(outs GPR:$base_wb),
 //  Load / store multiple Instructions.
 //
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$dsts, variable_ops),
-                 IndexModeNone, LdStMulFrm, IIC_iLoadm,
-                 "ldm${addr:submode}${p}\t$addr, $dsts", "", []>;
-
-def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                      reglist:$dsts, variable_ops),
-                     IndexModeUpd, LdStMulFrm, IIC_iLoadm,
-                     "ldm${addr:submode}${p}\t$addr!, $dsts",
-                     "$addr.addr = $wb", []>;
-} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq
-
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-def STM : AXI4st<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$srcs, variable_ops),
-                 IndexModeNone, LdStMulFrm, IIC_iStorem,
-                 "stm${addr:submode}${p}\t$addr, $srcs", "", []>;
-
-def STM_UPD : AXI4st<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                      reglist:$srcs, variable_ops),
-                     IndexModeUpd, LdStMulFrm, IIC_iStorem,
-                     "stm${addr:submode}${p}\t$addr!, $srcs",
-                     "$addr.addr = $wb", []>;
-} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq
+multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  def IA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "da${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "da${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ib${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ib${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+} 
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm LDM : arm_ldst_mult<"ldm", 1, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm STM : arm_ldst_mult<"stm", 0, LdStMulFrm, IIC_iStore_m, IIC_iStore_mu>;
+
+} // neverHasSideEffects
+
+// Load / Store Multiple Mnemonic Aliases
+def : MnemonicAlias<"ldm", "ldmia">;
+def : MnemonicAlias<"stm", "stmia">;
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+// FIXME: Should be a pseudo-instruction.
+def LDMIA_RET : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                      reglist:$regs, variable_ops),
+                     IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr,
+                     "ldmia${p}\t$Rn!, $regs",
+                     "$Rn = $wb", []> {
+  let Inst{24-23} = 0b01;       // Increment After
+  let Inst{21}    = 1;          // Writeback
+  let Inst{20}    = 1;          // Load
+}
 
 //===----------------------------------------------------------------------===//
 //  Move Instructions.
 //
 
 let neverHasSideEffects = 1 in
-def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
-                "mov", "\t$dst, $src", []>, UnaryDP {
+def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
+                "mov", "\t$Rd, $Rm", []>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
+
   let Inst{11-4} = 0b00000000;
   let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
 }
 
 // A version for the smaller set of tail call registers.
 let neverHasSideEffects = 1 in
-def MOVr_TC : AsI1<0b1101, (outs tcGPR:$dst), (ins tcGPR:$src), DPFrm, 
-                IIC_iMOVr, "mov", "\t$dst, $src", []>, UnaryDP {
+def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
+                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
+
   let Inst{11-4} = 0b00000000;
   let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
 }
 
-def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src),
+def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src),
                 DPSoRegFrm, IIC_iMOVsr,
-                "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP {
+                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg:$src)]>,
+                UnaryDP {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = src;
   let Inst{25} = 0;
 }
 
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi,
-                "mov", "\t$dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
   let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-0} = imm;
 }
 
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src),
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm),
                  DPFrm, IIC_iMOVi,
-                 "movw", "\t$dst, $src",
-                 [(set GPR:$dst, imm0_65535:$src)]>,
+                 "movw", "\t$Rd, $imm",
+                 [(set GPR:$Rd, imm0_65535:$imm)]>,
                  Requires<[IsARM, HasV6T2]>, UnaryDP {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
   let Inst{20} = 0;
   let Inst{25} = 1;
 }
 
-let Constraints = "$src = $dst" in
-def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
+def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm),
                   DPFrm, IIC_iMOVi,
-                  "movt", "\t$dst, $imm",
-                  [(set GPR:$dst,
+                  "movt", "\t$Rd, $imm",
+                  [(set GPR:$Rd,
                         (or (and GPR:$src, 0xffff),
                             lo16AllZero:$imm))]>, UnaryDP,
                   Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
   let Inst{20} = 0;
   let Inst{25} = 1;
 }
 
+def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+} // Constraints
+
 def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
       Requires<[IsARM, HasV6T2]>;
 
 let Uses = [CPSR] in
-def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
-                 "mov", "\t$dst, $src, rrx",
-                 [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP;
+def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                    [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
+                    Requires<[IsARM]>;
 
 // These aren't really mov instructions, but we have to define them this way
 // due to flag operands.
 
 let Defs = [CPSR] in {
-def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
-                      IIC_iMOVsi, "movs", "\t$dst, $src, lsr #1",
-                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP;
-def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
-                      IIC_iMOVsi, "movs", "\t$dst, $src, asr #1",
-                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP;
+def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
+                      Requires<[IsARM]>;
+def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
+                      Requires<[IsARM]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1551,31 +2038,31 @@ def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
 
 // Sign extenders
 
-defm SXTB  : AI_unary_rrot<0b01101010,
-                           "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm SXTH  : AI_unary_rrot<0b01101011,
-                           "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+defm SXTB  : AI_ext_rrot<0b01101010,
+                         "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+defm SXTH  : AI_ext_rrot<0b01101011,
+                         "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
 
-defm SXTAB : AI_bin_rrot<0b01101010,
+defm SXTAB : AI_exta_rrot<0b01101010,
                "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm SXTAH : AI_bin_rrot<0b01101011,
+defm SXTAH : AI_exta_rrot<0b01101011,
                "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
 
 // For disassembly only
-defm SXTB16  : AI_unary_rrot_np<0b01101000, "sxtb16">;
+defm SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
 
 // For disassembly only
-defm SXTAB16 : AI_bin_rrot_np<0b01101000, "sxtab16">;
+defm SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
 
 // Zero extenders
 
 let AddedComplexity = 16 in {
-defm UXTB   : AI_unary_rrot<0b01101110,
-                            "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm UXTH   : AI_unary_rrot<0b01101111,
-                            "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm UXTB16 : AI_unary_rrot<0b01101100,
-                            "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+defm UXTB   : AI_ext_rrot<0b01101110,
+                          "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+defm UXTH   : AI_ext_rrot<0b01101111,
+                          "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+defm UXTB16 : AI_ext_rrot<0b01101100,
+                          "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
 
 // FIXME: This pattern incorrectly assumes the shl operator is a rotate.
 //        The transformation should probably be done as a combiner action
@@ -1586,33 +2073,49 @@ defm UXTB16 : AI_unary_rrot<0b01101100,
 def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
                (UXTB16r_rot GPR:$Src, 8)>;
 
-defm UXTAB : AI_bin_rrot<0b01101110, "uxtab",
+defm UXTAB : AI_exta_rrot<0b01101110, "uxtab",
                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm UXTAH : AI_bin_rrot<0b01101111, "uxtah",
+defm UXTAH : AI_exta_rrot<0b01101111, "uxtah",
                         BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
 }
 
 // This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
 // For disassembly only
-defm UXTAB16 : AI_bin_rrot_np<0b01101100, "uxtab16">;
+defm UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
 
 
-def SBFX  : I<(outs GPR:$dst),
-              (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
-               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
-               "sbfx", "\t$dst, $src, $lsb, $width", "", []>,
+def SBFX  : I<(outs GPR:$Rd),
+              (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
                Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
   let Inst{27-21} = 0b0111101;
   let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
 }
 
-def UBFX  : I<(outs GPR:$dst),
-              (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
-               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
-               "ubfx", "\t$dst, $src, $lsb, $width", "", []>,
+def UBFX  : I<(outs GPR:$Rd),
+              (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
                Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
   let Inst{27-21} = 0b0111111;
   let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1620,100 +2123,166 @@ def UBFX  : I<(outs GPR:$dst),
 //
 
 defm ADD  : AsI1_bin_irs<0b0100, "add",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
                          BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
 defm SUB  : AsI1_bin_irs<0b0010, "sub",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
                          BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
 
 // ADD and SUB with 's' bit set.
 defm ADDS : AI1_bin_s_irs<0b0100, "adds",
+                          IIC_iALUi, IIC_iALUr, IIC_iALUsr,
                           BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
 defm SUBS : AI1_bin_s_irs<0b0010, "subs",
+                          IIC_iALUi, IIC_iALUr, IIC_iALUsr,
                           BinOpFrag<(subc node:$LHS, node:$RHS)>>;
 
 defm ADC : AI1_adde_sube_irs<0b0101, "adc",
                           BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>;
 defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
                           BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>;
+
+// ADC and SUBC with 's' bit set.
 defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs",
                           BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>;
 defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs",
                           BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>;
 
-def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
-                 IIC_iALUi, "rsb", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> {
-    let Inst{25} = 1;
+def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+                 IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm",
+                 [(set GPR:$Rd, (sub so_imm:$imm, GPR:$Rn))]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = imm;
 }
 
 // The reg/reg form is only defined for the disassembler; for codegen it is
 // equivalent to SUBrr.
-def RSBrr : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
-                 IIC_iALUr, "rsb", "\t$dst, $a, $b",
+def RSBrr : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+                 IIC_iALUr, "rsb", "\t$Rd, $Rn, $Rm",
                  [/* For disassembly only; pattern left blank */]> {
-    let Inst{25} = 0;
-    let Inst{11-4} = 0b00000000;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
 
-def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
-                 IIC_iALUsr, "rsb", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> {
-    let Inst{25} = 0;
+def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                 DPSoRegFrm, IIC_iALUsr, "rsb", "\t$Rd, $Rn, $shift",
+                 [(set GPR:$Rd, (sub so_reg:$shift, GPR:$Rn))]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{11-0} = shift;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
 
 // RSB with 's' bit set.
-let Defs = [CPSR] in {
-def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
-                 IIC_iALUi, "rsbs", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> {
-    let Inst{20} = 1;
-    let Inst{25} = 1;
-}
-def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
-                 IIC_iALUsr, "rsbs", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]> {
-    let Inst{20} = 1;
-    let Inst{25} = 0;
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def RSBSri : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+                 IIC_iALUi, "rsbs", "\t$Rd, $Rn, $imm",
+                 [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = imm;
+}
+def RSBSrs : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                 DPSoRegFrm, IIC_iALUsr, "rsbs", "\t$Rd, $Rn, $shift",
+                 [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{11-0} = shift;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
 }
 
 let Uses = [CPSR] in {
-def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
-                 DPFrm, IIC_iALUi, "rsc", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>,
+def RSCri : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                 DPFrm, IIC_iALUi, "rsc", "\t$Rd, $Rn, $imm",
+                 [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>,
                  Requires<[IsARM]> {
-    let Inst{25} = 1;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = imm;
 }
 // The reg/reg form is only defined for the disassembler; for codegen it is
 // equivalent to SUBrr.
-def RSCrr : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                 DPFrm, IIC_iALUr, "rsc", "\t$dst, $a, $b",
+def RSCrr : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                 DPFrm, IIC_iALUr, "rsc", "\t$Rd, $Rn, $Rm",
                  [/* For disassembly only; pattern left blank */]> {
-    let Inst{25} = 0;
-    let Inst{11-4} = 0b00000000;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
-def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
-                 DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b",
-                 [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>,
+def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                 DPSoRegFrm, IIC_iALUsr, "rsc", "\t$Rd, $Rn, $shift",
+                 [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>,
                  Requires<[IsARM]> {
-    let Inst{25} = 0;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{11-0} = shift;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
 }
 
 // FIXME: Allow these to be predicated.
-let Defs = [CPSR], Uses = [CPSR] in {
-def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
-                  DPFrm, IIC_iALUi, "rscs\t$dst, $a, $b",
-                  [(set GPR:$dst, (sube_dead_carry so_imm:$b, GPR:$a))]>,
+let isCodeGenOnly = 1, Defs = [CPSR], Uses = [CPSR] in {
+def RSCSri : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                  DPFrm, IIC_iALUi, "rscs\t$Rd, $Rn, $imm",
+                  [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>,
                   Requires<[IsARM]> {
-    let Inst{20} = 1;
-    let Inst{25} = 1;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = imm;
 }
-def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
-                  DPSoRegFrm, IIC_iALUsr, "rscs\t$dst, $a, $b",
-                  [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>,
+def RSCSrs : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+                  DPSoRegFrm, IIC_iALUsr, "rscs\t$Rd, $Rn, $shift",
+                  [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>,
                   Requires<[IsARM]> {
-    let Inst{20} = 1;
-    let Inst{25} = 0;
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{11-0} = shift;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
 }
 }
 
@@ -1740,111 +2309,166 @@ def : ARMPat<(adde   GPR:$src, so_imm_not:$imm),
 
 // ARM Arithmetic Instruction -- for disassembly only
 // GPR:$dst = GPR:$a op GPR:$b
-class AAI<bits<8> op27_20, bits<4> op7_4, string opc,
-          list<dag> pattern = [/* For disassembly only; pattern left blank */]>
-  : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr,
-       opc, "\t$dst, $a, $b", pattern> {
+class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
+          list<dag> pattern = [/* For disassembly only; pattern left blank */],
+          dag iops = (ins GPR:$Rn, GPR:$Rm), string asm = "\t$Rd, $Rn, $Rm">
+  : AI<(outs GPR:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rd;
+  bits<4> Rm;
   let Inst{27-20} = op27_20;
-  let Inst{7-4} = op7_4;
+  let Inst{11-4} = op11_4;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
 }
 
 // Saturating add/subtract -- for disassembly only
 
-def QADD    : AAI<0b00010000, 0b0101, "qadd",
-                  [(set GPR:$dst, (int_arm_qadd GPR:$a, GPR:$b))]>;
-def QADD16  : AAI<0b01100010, 0b0001, "qadd16">;
-def QADD8   : AAI<0b01100010, 0b1001, "qadd8">;
-def QASX    : AAI<0b01100010, 0b0011, "qasx">;
-def QDADD   : AAI<0b00010100, 0b0101, "qdadd">;
-def QDSUB   : AAI<0b00010110, 0b0101, "qdsub">;
-def QSAX    : AAI<0b01100010, 0b0101, "qsax">;
-def QSUB    : AAI<0b00010010, 0b0101, "qsub",
-                  [(set GPR:$dst, (int_arm_qsub GPR:$a, GPR:$b))]>;
-def QSUB16  : AAI<0b01100010, 0b0111, "qsub16">;
-def QSUB8   : AAI<0b01100010, 0b1111, "qsub8">;
-def UQADD16 : AAI<0b01100110, 0b0001, "uqadd16">;
-def UQADD8  : AAI<0b01100110, 0b1001, "uqadd8">;
-def UQASX   : AAI<0b01100110, 0b0011, "uqasx">;
-def UQSAX   : AAI<0b01100110, 0b0101, "uqsax">;
-def UQSUB16 : AAI<0b01100110, 0b0111, "uqsub16">;
-def UQSUB8  : AAI<0b01100110, 0b1111, "uqsub8">;
+def QADD    : AAI<0b00010000, 0b00000101, "qadd",
+                  [(set GPR:$Rd, (int_arm_qadd GPR:$Rm, GPR:$Rn))],
+                  (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
+                  [(set GPR:$Rd, (int_arm_qsub GPR:$Rm, GPR:$Rn))],
+                  (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [], (ins GPR:$Rm, GPR:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [], (ins GPR:$Rm, GPR:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+
+def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
+def QADD8   : AAI<0b01100010, 0b11111001, "qadd8">;
+def QASX    : AAI<0b01100010, 0b11110011, "qasx">;
+def QSAX    : AAI<0b01100010, 0b11110101, "qsax">;
+def QSUB16  : AAI<0b01100010, 0b11110111, "qsub16">;
+def QSUB8   : AAI<0b01100010, 0b11111111, "qsub8">;
+def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
+def UQADD8  : AAI<0b01100110, 0b11111001, "uqadd8">;
+def UQASX   : AAI<0b01100110, 0b11110011, "uqasx">;
+def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
+def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
+def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
 
 // Signed/Unsigned add/subtract -- for disassembly only
 
-def SASX   : AAI<0b01100001, 0b0011, "sasx">;
-def SADD16 : AAI<0b01100001, 0b0001, "sadd16">;
-def SADD8  : AAI<0b01100001, 0b1001, "sadd8">;
-def SSAX   : AAI<0b01100001, 0b0101, "ssax">;
-def SSUB16 : AAI<0b01100001, 0b0111, "ssub16">;
-def SSUB8  : AAI<0b01100001, 0b1111, "ssub8">;
-def UASX   : AAI<0b01100101, 0b0011, "uasx">;
-def UADD16 : AAI<0b01100101, 0b0001, "uadd16">;
-def UADD8  : AAI<0b01100101, 0b1001, "uadd8">;
-def USAX   : AAI<0b01100101, 0b0101, "usax">;
-def USUB16 : AAI<0b01100101, 0b0111, "usub16">;
-def USUB8  : AAI<0b01100101, 0b1111, "usub8">;
+def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
+def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
+def SADD8  : AAI<0b01100001, 0b11111001, "sadd8">;
+def SSAX   : AAI<0b01100001, 0b11110101, "ssax">;
+def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
+def SSUB8  : AAI<0b01100001, 0b11111111, "ssub8">;
+def UASX   : AAI<0b01100101, 0b11110011, "uasx">;
+def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
+def UADD8  : AAI<0b01100101, 0b11111001, "uadd8">;
+def USAX   : AAI<0b01100101, 0b11110101, "usax">;
+def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
+def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
 
 // Signed/Unsigned halving add/subtract -- for disassembly only
 
-def SHASX   : AAI<0b01100011, 0b0011, "shasx">;
-def SHADD16 : AAI<0b01100011, 0b0001, "shadd16">;
-def SHADD8  : AAI<0b01100011, 0b1001, "shadd8">;
-def SHSAX   : AAI<0b01100011, 0b0101, "shsax">;
-def SHSUB16 : AAI<0b01100011, 0b0111, "shsub16">;
-def SHSUB8  : AAI<0b01100011, 0b1111, "shsub8">;
-def UHASX   : AAI<0b01100111, 0b0011, "uhasx">;
-def UHADD16 : AAI<0b01100111, 0b0001, "uhadd16">;
-def UHADD8  : AAI<0b01100111, 0b1001, "uhadd8">;
-def UHSAX   : AAI<0b01100111, 0b0101, "uhsax">;
-def UHSUB16 : AAI<0b01100111, 0b0111, "uhsub16">;
-def UHSUB8  : AAI<0b01100111, 0b1111, "uhsub8">;
+def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
+def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
+def SHADD8  : AAI<0b01100011, 0b11111001, "shadd8">;
+def SHSAX   : AAI<0b01100011, 0b11110101, "shsax">;
+def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
+def SHSUB8  : AAI<0b01100011, 0b11111111, "shsub8">;
+def UHASX   : AAI<0b01100111, 0b11110011, "uhasx">;
+def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
+def UHADD8  : AAI<0b01100111, 0b11111001, "uhadd8">;
+def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
+def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
+def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
 
 // Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
 
-def USAD8  : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b),
+def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                 MulFrm /* for convenience */, NoItinerary, "usad8",
-                "\t$dst, $a, $b", []>,
+                "\t$Rd, $Rn, $Rm", []>,
              Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
   let Inst{27-20} = 0b01111000;
   let Inst{15-12} = 0b1111;
   let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
 }
-def USADA8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                 MulFrm /* for convenience */, NoItinerary, "usada8",
-                "\t$dst, $a, $b, $acc", []>,
+                "\t$Rd, $Rn, $Rm, $Ra", []>,
              Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
   let Inst{27-20} = 0b01111000;
   let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
 }
 
 // Signed/Unsigned saturate -- for disassembly only
 
-def SSAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh),
-              SatFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh",
+def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
+              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh",
               [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
   let Inst{27-21} = 0b0110101;
   let Inst{5-4} = 0b01;
+  let Inst{20-16} = sat_imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{7-3};
+  let Inst{6} = sh{0};
+  let Inst{3-0} = Rn;
 }
 
-def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm,
-                NoItinerary, "ssat16", "\t$dst, $bit_pos, $a",
+def SSAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$Rn), SatFrm,
+                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn",
                 [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
   let Inst{27-20} = 0b01101010;
-  let Inst{7-4} = 0b0011;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
 }
 
-def USAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh),
-              SatFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh",
+def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
+              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $a$sh",
               [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
   let Inst{27-21} = 0b0110111;
   let Inst{5-4} = 0b01;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{7-3};
+  let Inst{6} = sh{0};
+  let Inst{20-16} = sat_imm;
+  let Inst{3-0} = Rn;
 }
 
-def USAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm,
-                NoItinerary, "usat16", "\t$dst, $bit_pos, $a",
+def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm,
+                NoItinerary, "usat16", "\t$Rd, $sat_imm, $a",
                 [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
   let Inst{27-20} = 0b01101110;
-  let Inst{7-4} = 0b0011;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
 }
 
 def : ARMV6Pat<(int_arm_ssat GPR:$a, imm:$pos), (SSAT imm:$pos, GPR:$a, 0)>;
@@ -1855,52 +2479,100 @@ def : ARMV6Pat<(int_arm_usat GPR:$a, imm:$pos), (USAT imm:$pos, GPR:$a, 0)>;
 //
 
 defm AND   : AsI1_bin_irs<0b0000, "and",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
                           BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
-defm ANDS  : AI1_bin_s_irs<0b0000, "and",
-                           BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>;
 defm ORR   : AsI1_bin_irs<0b1100, "orr",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
                           BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
 defm EOR   : AsI1_bin_irs<0b0001, "eor",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
                           BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
 defm BIC   : AsI1_bin_irs<0b1110, "bic",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
                           BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
 
-def BFC    : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
                AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
-               "bfc", "\t$dst, $imm", "$src = $dst",
-               [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+               "bfc", "\t$Rd, $imm", "$src = $Rd",
+               [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
                Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<10> imm;
   let Inst{27-21} = 0b0111110;
   let Inst{6-0}   = 0b0011111;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // width
 }
 
 // A8.6.18  BFI - Bitfield insert (Encoding A1)
-def BFI    : I<(outs GPR:$dst), (ins GPR:$src, GPR:$val, bf_inv_mask_imm:$imm),
+def BFI    : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
                AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
-               "bfi", "\t$dst, $val, $imm", "$src = $dst",
-               [(set GPR:$dst, (ARMbfi GPR:$src, GPR:$val,
+               "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+               [(set GPR:$Rd, (ARMbfi GPR:$src, GPR:$Rn,
                                 bf_inv_mask_imm:$imm))]>,
                Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // width
+  let Inst{3-0}   = Rn;
+}
+
+// GNU as only supports this form of bfi (w/ 4 arguments)
+let isAsmParserOnly = 1 in
+def BFI4p : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn,
+                                   lsb_pos_imm:$lsb, width_imm:$width),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "bfi", "\t$Rd, $Rn, $lsb, $width", "$src = $Rd",
+               []>, Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
   let Inst{27-21} = 0b0111110;
   let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{20-16} = width; // Custom encoder => lsb+width-1
+  let Inst{3-0}   = Rn;
 }
 
-def  MVNr  : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
-                  "mvn", "\t$dst, $src",
-                  [(set GPR:$dst, (not GPR:$src))]>, UnaryDP {
+def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
+                  "mvn", "\t$Rd, $Rm",
+                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
   let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
   let Inst{11-4} = 0b00000000;
-}
-def  MVNs  : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
-                  IIC_iMOVsr, "mvn", "\t$dst, $src",
-                  [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP {
+  let Inst{15-12} = Rd;
+  let Inst{3-0} = Rm;
+}
+def  MVNs  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg:$shift), DPSoRegFrm,
+                  IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg:$shift))]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> shift;
   let Inst{25} = 0;
-}
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def  MVNi  : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm,
-                  IIC_iMOVi, "mvn", "\t$dst, $imm",
-                  [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP {
-    let Inst{25} = 1;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = shift;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
+                  IIC_iMVNi, "mvn", "\t$Rd, $imm",
+                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
 }
 
 def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
@@ -1909,247 +2581,299 @@ def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
 //===----------------------------------------------------------------------===//
 //  Multiply Instructions.
 //
+class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
 
-let isCommutable = 1 in
-def MUL   : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                   IIC_iMUL32, "mul", "\t$dst, $a, $b",
-                   [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
-
-def MLA   : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-                    IIC_iMAC32, "mla", "\t$dst, $a, $b, $c",
-                   [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
-
-def MLS   : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-                   IIC_iMAC32, "mls", "\t$dst, $a, $b, $c",
-                   [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>,
-                   Requires<[IsARM, HasV6T2]>;
+let isCommutable = 1 in {
+let Constraints = "@earlyclobber $Rd" in
+def MULv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
+                                          pred:$p, cc_out:$s),
+                          Size4Bytes, IIC_iMUL32,
+                         [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>,
+                        Requires<[IsARM, NoV6]>;
+
+def MUL  : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                   IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>,
+                   Requires<[IsARM, HasV6]>;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MLAv5: ARMPseudoInst<(outs GPR:$Rd),
+                         (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
+                         Size4Bytes, IIC_iMAC32, 
+                         [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, 
+                        Requires<[IsARM, NoV6]> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+def MLA  : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                    IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+                   Requires<[IsARM, HasV6]> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                   IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
+                   Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<4> Ra;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
 
 // Extra precision multiplies with low / high results
+
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
-def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iMUL64,
-                    "smull", "\t$ldst, $hdst, $a, $b", []>;
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 
+                            Size4Bytes, IIC_iMUL64, []>,
+                           Requires<[IsARM, NoV6]>;
 
-def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iMUL64,
-                    "umull", "\t$ldst, $hdst, $a, $b", []>;
+def UMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            Size4Bytes, IIC_iMUL64, []>,
+                           Requires<[IsARM, NoV6]>;
 }
 
-// Multiply + accumulate
-def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
-                    "smlal", "\t$ldst, $hdst, $a, $b", []>;
+def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
 
-def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
-                    "umlal", "\t$ldst, $hdst, $a, $b", []>;
+def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+}
 
-def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
-                    "umaal", "\t$ldst, $hdst, $a, $b", []>,
+// Multiply + accumulate
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 
+                            Size4Bytes, IIC_iMAC64, []>,
+                           Requires<[IsARM, NoV6]>;
+def UMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 
+                            Size4Bytes, IIC_iMAC64, []>,
+                           Requires<[IsARM, NoV6]>;
+def UMAALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 
+                            Size4Bytes, IIC_iMAC64, []>,
+                           Requires<[IsARM, NoV6]>;
+
+}
+
+def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+                    "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+                    "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
                     Requires<[IsARM, HasV6]>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+                    "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdLo;
+  let Inst{15-12} = RdHi;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
 } // neverHasSideEffects
 
 // Most significant word multiply
-def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-               IIC_iMUL32, "smmul", "\t$dst, $a, $b",
-               [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
+def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
             Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b0001;
   let Inst{15-12} = 0b1111;
 }
 
-def SMMULR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-               IIC_iMUL32, "smmulr", "\t$dst, $a, $b",
+def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
                [/* For disassembly only; pattern left blank */]>,
             Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b0011; // R = 1
   let Inst{15-12} = 0b1111;
 }
 
-def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c",
-               [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
-            Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b0001;
-}
+def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+            Requires<[IsARM, HasV6]>;
 
-def SMMLAR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iMAC32, "smmlar", "\t$dst, $a, $b, $c",
+def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
                [/* For disassembly only; pattern left blank */]>,
-            Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b0011; // R = 1
-}
+            Requires<[IsARM, HasV6]>;
 
-def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c",
-               [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
-            Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b1101;
-}
+def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>,
+            Requires<[IsARM, HasV6]>;
 
-def SMMLSR : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iMAC32, "smmlsr", "\t$dst, $a, $b, $c",
+def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
                [/* For disassembly only; pattern left blank */]>,
-            Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b1111; // R = 1
-}
+            Requires<[IsARM, HasV6]>;
 
 multiclass AI_smul<string opc, PatFrag opnode> {
-  def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
-                                      (sext_inreg GPR:$b, i16)))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 0;
-           }
-
-  def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL32, !strconcat(opc, "bt"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
-                                      (sra GPR:$b, (i32 16))))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 1;
-           }
-
-  def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL32, !strconcat(opc, "tb"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
-                                      (sext_inreg GPR:$b, i16)))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 0;
-           }
-
-  def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL32, !strconcat(opc, "tt"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
-                                      (sra GPR:$b, (i32 16))))]>,
-            Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 1;
-           }
-
-  def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL16, !strconcat(opc, "wb"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (sra (opnode GPR:$a,
-                                    (sext_inreg GPR:$b, i16)), (i32 16)))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 0;
-           }
-
-  def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iMUL16, !strconcat(opc, "wt"), "\t$dst, $a, $b",
-              [(set GPR:$dst, (sra (opnode GPR:$a,
-                                    (sra GPR:$b, (i32 16))), (i32 16)))]>,
-            Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 1;
-           }
+  def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sext_inreg GPR:$Rm, i16)), (i32 16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sra GPR:$Rm, (i32 16))), (i32 16)))]>,
+            Requires<[IsARM, HasV5TE]>;
 }
 
 
 multiclass AI_smla<string opc, PatFrag opnode> {
-  def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc",
-              [(set GPR:$dst, (add GPR:$acc,
-                               (opnode (sext_inreg GPR:$a, i16),
-                                       (sext_inreg GPR:$b, i16))))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 0;
-           }
-
-  def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc",
-              [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
-                                                    (sra GPR:$b, (i32 16)))))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 1;
-           }
-
-  def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc",
-              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
-                                                 (sext_inreg GPR:$b, i16))))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 0;
-           }
-
-  def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc",
-             [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
-                                                    (sra GPR:$b, (i32 16)))))]>,
-            Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 1;
-             let Inst{6} = 1;
-           }
-
-  def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc",
-              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
-                                       (sext_inreg GPR:$b, i16)), (i32 16))))]>,
-           Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 0;
-           }
-
-  def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc",
-              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
-                                         (sra GPR:$b, (i32 16))), (i32 16))))]>,
-            Requires<[IsARM, HasV5TE]> {
-             let Inst{5} = 0;
-             let Inst{6} = 1;
-           }
+  def BB : AMulxyIa<0b0001000, 0b00, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPR:$Rd, (add GPR:$Ra,
+                               (opnode (sext_inreg GPR:$Rn, i16),
+                                       (sext_inreg GPR:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def BT : AMulxyIa<0b0001000, 0b10, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPR:$Rd, (add GPR:$Ra, (opnode (sext_inreg GPR:$Rn, i16),
+                                                   (sra GPR:$Rm, (i32 16)))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TB : AMulxyIa<0b0001000, 0b01, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
+                                                (sext_inreg GPR:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TT : AMulxyIa<0b0001000, 0b11, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
+                                                   (sra GPR:$Rm, (i32 16)))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AMulxyIa<0b0001001, 0b00, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
+                                      (sext_inreg GPR:$Rm, i16)), (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def WT : AMulxyIa<0b0001001, 0b10, (outs GPR:$Rd),
+              (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
+                                        (sra GPR:$Rm, (i32 16))), (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
 }
 
 defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 
 // Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only
-def SMLALBB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b),
-                      IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b",
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPR:$RdLo, GPR:$RdHi),
+                      (ins GPR:$Rn, GPR:$Rm),
+                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm",
                       [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV5TE]> {
-  let Inst{5} = 0;
-  let Inst{6} = 0;
-}
+              Requires<[IsARM, HasV5TE]>;
 
-def SMLALBT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b),
-                      IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b",
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPR:$RdLo, GPR:$RdHi),
+                      (ins GPR:$Rn, GPR:$Rm),
+                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm",
                       [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV5TE]> {
-  let Inst{5} = 0;
-  let Inst{6} = 1;
-}
+              Requires<[IsARM, HasV5TE]>;
 
-def SMLALTB : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b),
-                      IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b",
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPR:$RdLo, GPR:$RdHi),
+                      (ins GPR:$Rn, GPR:$Rm),
+                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm",
                       [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV5TE]> {
-  let Inst{5} = 1;
-  let Inst{6} = 0;
-}
+              Requires<[IsARM, HasV5TE]>;
 
-def SMLALTT : AMulxyI<0b0001010,(outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b),
-                      IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b",
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPR:$RdLo, GPR:$RdHi),
+                      (ins GPR:$Rn, GPR:$Rm),
+                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm",
                       [/* For disassembly only; pattern left blank */]>,
-              Requires<[IsARM, HasV5TE]> {
-  let Inst{5} = 1;
-  let Inst{6} = 1;
-}
+              Requires<[IsARM, HasV5TE]>;
 
 // Helper class for AI_smld -- for disassembly only
-class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
-                InstrItinClass itin, string opc, string asm>
+class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
+                    InstrItinClass itin, string opc, string asm>
   : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<4> Rm;
   let Inst{4}     = 1;
   let Inst{5}     = swap;
   let Inst{6}     = sub;
@@ -2157,21 +2881,46 @@ class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
   let Inst{21-20} = 0b00;
   let Inst{22}    = long;
   let Inst{27-23} = 0b01110;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = Rd;
+}
+class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
 }
 
 multiclass AI_smld<bit sub, string opc> {
 
-  def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-                  NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b, $acc">;
+  def D : AMulDualIa<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
 
-  def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-                  NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b, $acc">;
+  def DX: AMulDualIa<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
 
-  def LD : AMulDualI<1, sub, 0, (outs GPR:$ldst,GPR:$hdst), (ins GPR:$a,GPR:$b),
-                  NoItinerary, !strconcat(opc, "ld"), "\t$ldst, $hdst, $a, $b">;
+  def LD: AMulDualI64<1, sub, 0, (outs GPR:$RdLo,GPR:$RdHi),
+                  (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
 
-  def LDX : AMulDualI<1, sub, 1, (outs GPR:$ldst,GPR:$hdst),(ins GPR:$a,GPR:$b),
-                  NoItinerary, !strconcat(opc, "ldx"),"\t$ldst, $hdst, $a, $b">;
+  def LDX : AMulDualI64<1, sub, 1, (outs GPR:$RdLo,GPR:$RdHi),
+                  (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
 
 }
 
@@ -2180,16 +2929,10 @@ defm SMLS : AI_smld<1, "smls">;
 
 multiclass AI_sdml<bit sub, string opc> {
 
-  def D : AMulDualI<0, sub, 0, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                    NoItinerary, !strconcat(opc, "d"), "\t$dst, $a, $b"> {
-    let Inst{15-12} = 0b1111;
-  }
-
-  def DX : AMulDualI<0, sub, 1, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-                    NoItinerary, !strconcat(opc, "dx"), "\t$dst, $a, $b"> {
-    let Inst{15-12} = 0b1111;
-  }
-
+  def D : AMulDualI<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                    NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+  def DX : AMulDualI<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                    NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
 }
 
 defm SMUA : AI_sdml<0, "smua">;
@@ -2199,55 +2942,35 @@ defm SMUS : AI_sdml<1, "smus">;
 //  Misc. Arithmetic Instructions.
 //
 
-def CLZ  : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
-              "clz", "\t$dst, $src",
-              [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> {
-  let Inst{7-4}   = 0b0001;
-  let Inst{11-8}  = 0b1111;
-  let Inst{19-16} = 0b1111;
-}
-
-def RBIT : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
-              "rbit", "\t$dst, $src",
-              [(set GPR:$dst, (ARMrbit GPR:$src))]>,
-           Requires<[IsARM, HasV6T2]> {
-  let Inst{7-4}   = 0b0011;
-  let Inst{11-8}  = 0b1111;
-  let Inst{19-16} = 0b1111;
-}
-
-def REV  : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
-              "rev", "\t$dst, $src",
-              [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b0011;
-  let Inst{11-8}  = 0b1111;
-  let Inst{19-16} = 0b1111;
-}
-
-def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
-               "rev16", "\t$dst, $src",
-               [(set GPR:$dst,
-                   (or (and (srl GPR:$src, (i32 8)), 0xFF),
-                       (or (and (shl GPR:$src, (i32 8)), 0xFF00),
-                           (or (and (srl GPR:$src, (i32 8)), 0xFF0000),
-                               (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>,
-               Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b1011;
-  let Inst{11-8}  = 0b1111;
-  let Inst{19-16} = 0b1111;
-}
-
-def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
-               "revsh", "\t$dst, $src",
-               [(set GPR:$dst,
+def CLZ  : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "clz", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>;
+
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>,
+           Requires<[IsARM, HasV6T2]>;
+
+def REV  : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rev", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>;
+
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+               [(set GPR:$Rd,
+                   (or (and (srl GPR:$Rm, (i32 8)), 0xFF),
+                       (or (and (shl GPR:$Rm, (i32 8)), 0xFF00),
+                           (or (and (srl GPR:$Rm, (i32 8)), 0xFF0000),
+                               (and (shl GPR:$Rm, (i32 8)), 0xFF000000)))))]>,
+               Requires<[IsARM, HasV6]>;
+
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+               [(set GPR:$Rd,
                   (sext_inreg
-                    (or (srl (and GPR:$src, 0xFF00), (i32 8)),
-                        (shl GPR:$src, (i32 8))), i16))]>,
-               Requires<[IsARM, HasV6]> {
-  let Inst{7-4}   = 0b1011;
-  let Inst{11-8}  = 0b1111;
-  let Inst{19-16} = 0b1111;
-}
+                    (or (srl (and GPR:$Rm, 0xFF00), (i32 8)),
+                        (shl GPR:$Rm, (i32 8))), i16))]>,
+               Requires<[IsARM, HasV6]>;
 
 def lsl_shift_imm : SDNodeXForm<imm, [{
   unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue());
@@ -2258,21 +2981,19 @@ def lsl_amt : PatLeaf<(i32 imm), [{
   return (N->getZExtValue() < 32);
 }], lsl_shift_imm>;
 
-def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst),
-                                 (ins GPR:$src1, GPR:$src2, shift_imm:$sh),
-               IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh",
-               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
-                                   (and (shl GPR:$src2, lsl_amt:$sh),
-                                        0xFFFF0000)))]>,
-               Requires<[IsARM, HasV6]> {
-  let Inst{6-4} = 0b001;
-}
+def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd),
+                              (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+               IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF),
+                                  (and (shl GPR:$Rm, lsl_amt:$sh),
+                                       0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]>;
 
 // Alternate cases for PKHBT where identities eliminate some nodes.
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
-               (PKHBT GPR:$src1, GPR:$src2, 0)>;
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$sh)),
-               (PKHBT GPR:$src1, GPR:$src2, (lsl_shift_imm imm16_31:$sh))>;
+def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (and GPR:$Rm, 0xFFFF0000)),
+               (PKHBT GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (shl GPR:$Rm, imm16_31:$sh)),
+               (PKHBT GPR:$Rn, GPR:$Rm, (lsl_shift_imm imm16_31:$sh))>;
 
 def asr_shift_imm : SDNodeXForm<imm, [{
   unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::asr, N->getZExtValue());
@@ -2285,15 +3006,13 @@ def asr_amt : PatLeaf<(i32 imm), [{
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
 // will match the pattern below.
-def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
-                                 (ins GPR:$src1, GPR:$src2, shift_imm:$sh),
-               IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh",
-               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
-                                   (and (sra GPR:$src2, asr_amt:$sh),
-                                        0xFFFF)))]>,
-               Requires<[IsARM, HasV6]> {
-  let Inst{6-4} = 0b101;
-}
+def PKHTB : APKHI<0b01101000, 1, (outs GPR:$Rd),
+                              (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+               IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF0000),
+                                  (and (sra GPR:$Rm, asr_amt:$sh),
+                                       0xFFFF)))]>,
+               Requires<[IsARM, HasV6]>;
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
@@ -2308,10 +3027,19 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
 //
 
 defm CMP  : AI1_cmp_irs<0b1010, "cmp",
+                        IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
                         BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
 
-// FIXME: There seems to be a (potential) hardware bug with the CMN instruction
-// and comparison with 0. These two pieces of code should give identical
+// ARMcmpZ can re-use the above instruction definitions.
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
+             (CMPri   GPR:$src, so_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
+             (CMPrr   GPR:$src, GPR:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg:$rhs),
+             (CMPrs   GPR:$src, so_reg:$rhs)>;
+
+// FIXME: We have to be careful when using the CMN instruction and comparison
+// with 0. One would expect these two pieces of code should give identical
 // results:
 //
 //   rsbs r1, r1, 0
@@ -2321,7 +3049,7 @@ defm CMP  : AI1_cmp_irs<0b1010, "cmp",
 //   mov  r0, #1
 //
 // and:
-// 
+//
 //   cmn  r0, r1
 //   mov  r0, #0
 //   it   ls
@@ -2336,20 +3064,16 @@ defm CMP  : AI1_cmp_irs<0b1010, "cmp",
 // never a "carry" when this AddWithCarry is performed (because the "carry bit"
 // parameter to AddWithCarry is defined as 0).
 //
-// The AddWithCarry in the CMP case seems to be relying upon the identity:
-// 
-//   ~x + 1 = -x
-//
-// However when x is 0 and unsigned, this doesn't hold:
+// When x is 0 and unsigned:
 //
 //    x = 0
 //   ~x = 0xFFFF FFFF
 //   ~x + 1 = 0x1 0000 0000
 //   (-x = 0) != (0x1 0000 0000 = ~x + 1)
 //
-// Therefore, we should disable *all* versions of CMN, especially when comparing
-// against zero, until we can limit when the CMN instruction is used (when we
-// know that the RHS is not 0) or when we have a hardware fix for this.
+// Therefore, we should disable CMN when comparing against zero, until we can
+// limit when the CMN instruction is used (when we know that the RHS is not 0 or
+// when it's a comparison which doesn't look at the 'carry' flag).
 //
 // (See the ARM docs for the "AddWithCarry" pseudo-code.)
 //
@@ -2360,13 +3084,14 @@ defm CMP  : AI1_cmp_irs<0b1010, "cmp",
 
 // Note that TST/TEQ don't set all the same flags that CMP does!
 defm TST  : AI1_cmp_irs<0b1000, "tst",
-                        BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>, 1>;
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>;
 defm TEQ  : AI1_cmp_irs<0b1001, "teq",
-                        BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>, 1>;
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
 
-defm CMPz  : AI1_cmp_irs<0b1010, "cmp",
-                         BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
 defm CMNz  : AI1_cmp_irs<0b1011, "cmn",
+                         IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
                          BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
 
 //def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
@@ -2381,13 +3106,10 @@ let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
 def BCCi64 : PseudoInst<(outs),
     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
      IIC_Br,
-     "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, imm:$cc",
     [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>;
 
 def BCCZi64 : PseudoInst<(outs),
-     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst),
-      IIC_Br,
-     "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, 0, 0, imm:$cc",
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
     [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>;
 } // usesCustomInserter
 
@@ -2395,29 +3117,87 @@ def BCCZi64 : PseudoInst<(outs),
 // Conditional moves
 // FIXME: should be able to write a pattern for ARMcmov, but can't use
 // a two-value operand where a dag node expects two operands. :(
+// FIXME: These should all be pseudo-instructions that get expanded to
+//        the normal MOV instructions. That would fix the dependency on
+//        special casing them in tblgen.
 let neverHasSideEffects = 1 in {
-def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm,
-                IIC_iCMOVr, "mov", "\t$dst, $true",
-      [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $dst">, UnaryDP {
-  let Inst{11-4} = 0b00000000;
+def MOVCCr : AI1<0b1101, (outs GPR:$Rd), (ins GPR:$false, GPR:$Rm), DPFrm,
+                IIC_iCMOVr, "mov", "\t$Rd, $Rm",
+      [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
   let Inst{25} = 0;
+  let Inst{20} = 0;
+  let Inst{15-12} = Rd;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rm;
 }
 
-def MOVCCs : AI1<0b1101, (outs GPR:$dst),
-                        (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr,
-                "mov", "\t$dst, $true",
-   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $dst">, UnaryDP {
+def MOVCCs : AI1<0b1101, (outs GPR:$Rd),
+                 (ins GPR:$false, so_reg:$shift), DPSoRegFrm, IIC_iCMOVsr,
+                "mov", "\t$Rd, $shift",
+   [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">, UnaryDP {
+  bits<4> Rd;
+  bits<12> shift;
   let Inst{25} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = shift;
 }
 
-def MOVCCi : AI1<0b1101, (outs GPR:$dst),
-                        (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi,
-                "mov", "\t$dst, $true",
-   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $dst">, UnaryDP {
+let isMoveImm = 1 in
+def MOVCCi16 : AI1<0b1000, (outs GPR:$Rd), (ins GPR:$false, i32imm_hilo16:$imm),
+                 DPFrm, IIC_iMOVi,
+                 "movw", "\t$Rd, $imm",
+                 []>,
+                 RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+                 UnaryDP {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 0;
+  let Inst{19-16} = imm{15-12};
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+}
+
+let isMoveImm = 1 in
+def MOVCCi : AI1<0b1101, (outs GPR:$Rd),
+                         (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi,
+                "mov", "\t$Rd, $imm",
+   [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">, UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
   let Inst{25} = 1;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
+}
+
+// Two instruction predicate mov immediate.
+let isMoveImm = 1 in
+def MOVCCi32imm : PseudoInst<(outs GPR:$Rd),
+                             (ins GPR:$false, i32imm:$src, pred:$p),
+                  IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">;
+
+let isMoveImm = 1 in
+def MVNCCi : AI1<0b1111, (outs GPR:$Rd),
+                         (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi,
+                "mvn", "\t$Rd, $imm",
+ [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">, UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
 }
 } // neverHasSideEffects
 
@@ -2425,64 +3205,41 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
 // Atomic operations intrinsics
 //
 
+def memb_opt : Operand<i32> {
+  let PrintMethod = "printMemBOption";
+  let ParserMatchClass = MemBarrierOptOperand;
+}
+
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
-def DMBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dmb", "",
-                  [(ARMMemBarrier)]>, Requires<[IsARM, HasDB]> {
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
   let Inst{31-4} = 0xf57ff05;
-  // FIXME: add support for options other than a full system DMB
-  // See DMB disassembly-only variants below.
-  let Inst{3-0} = 0b1111;
-}
-
-def DSBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dsb", "",
-                  [(ARMSyncBarrier)]>, Requires<[IsARM, HasDB]> {
-  let Inst{31-4} = 0xf57ff04;
-  // FIXME: add support for options other than a full system DSB
-  // See DSB disassembly-only variants below.
-  let Inst{3-0} = 0b1111;
+  let Inst{3-0} = opt;
 }
 
 def DMB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary,
                        "mcr", "\tp15, 0, $zero, c7, c10, 5",
                        [(ARMMemBarrierMCR GPR:$zero)]>,
                        Requires<[IsARM, HasV6]> {
-  // FIXME: add support for options other than a full system DMB
   // FIXME: add encoding
 }
-
-def DSB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary,
-                        "mcr", "\tp15, 0, $zero, c7, c10, 4",
-                        [(ARMSyncBarrierMCR GPR:$zero)]>,
-                        Requires<[IsARM, HasV6]> {
-  // FIXME: add support for options other than a full system DSB
-  // FIXME: add encoding
-}
-}
-
-// Memory Barrier Operations Variants -- for disassembly only
-
-def memb_opt : Operand<i32> {
-  let PrintMethod = "printMemBOption";
 }
 
-class AMBI<bits<4> op7_4, string opc>
-  : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, opc, "\t$opt",
-          [/* For disassembly only; pattern left blank */]>,
-    Requires<[IsARM, HasDB]> {
-  let Inst{31-8} = 0xf57ff0;
-  let Inst{7-4} = op7_4;
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dsb", "\t$opt",
+                [/* For disassembly only; pattern left blank */]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff04;
+  let Inst{3-0} = opt;
 }
 
-// These DMB variants are for disassembly only.
-def DMBvar : AMBI<0b0101, "dmb">;
-
-// These DSB variants are for disassembly only.
-def DSBvar : AMBI<0b0100, "dsb">;
-
 // ISB has only full system option -- for disassembly only
-def ISBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "isb", "", []>,
-            Requires<[IsARM, HasDB]> {
+def ISB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "isb", "", []>,
+                Requires<[IsARM, HasDB]> {
   let Inst{31-4} = 0xf57ff06;
   let Inst{3-0} = 0b1111;
 }
@@ -2491,138 +3248,114 @@ let usesCustomInserter = 1 in {
   let Uses = [CPSR] in {
     def ATOMIC_LOAD_ADD_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_SUB_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_AND_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_OR_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_XOR_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_NAND_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_ADD_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_SUB_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_AND_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_OR_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_XOR_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_NAND_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_ADD_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_SUB_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_AND_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_OR_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_XOR_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
     def ATOMIC_LOAD_NAND_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
 
     def ATOMIC_SWAP_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_SWAP_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
     def ATOMIC_SWAP_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_SWAP_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
     def ATOMIC_SWAP_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_SWAP_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
 
     def ATOMIC_CMP_SWAP_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!",
       [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
     def ATOMIC_CMP_SWAP_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!",
       [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
     def ATOMIC_CMP_SWAP_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!",
       [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
 }
 }
 
 let mayLoad = 1 in {
-def LDREXB : AIldrex<0b10, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
-                    "ldrexb", "\t$dest, [$ptr]",
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary,
+                    "ldrexb", "\t$Rt, [$Rn]",
                     []>;
-def LDREXH : AIldrex<0b11, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
-                    "ldrexh", "\t$dest, [$ptr]",
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary,
+                    "ldrexh", "\t$Rt, [$Rn]",
                     []>;
-def LDREX  : AIldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
-                    "ldrex", "\t$dest, [$ptr]",
+def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary,
+                    "ldrex", "\t$Rt, [$Rn]",
                     []>;
-def LDREXD : AIldrex<0b01, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins GPR:$Rn),
                     NoItinerary,
-                    "ldrexd", "\t$dest, $dest2, [$ptr]",
+                    "ldrexd", "\t$Rt, $Rt2, [$Rn]",
                     []>;
 }
 
-let mayStore = 1, Constraints = "@earlyclobber $success" in {
-def STREXB : AIstrex<0b10, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$src, GPR:$Rn),
                     NoItinerary,
-                    "strexb", "\t$success, $src, [$ptr]",
+                    "strexb", "\t$Rd, $src, [$Rn]",
                     []>;
-def STREXH : AIstrex<0b11, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn),
                     NoItinerary,
-                    "strexh", "\t$success, $src, [$ptr]",
+                    "strexh", "\t$Rd, $Rt, [$Rn]",
                     []>;
-def STREX  : AIstrex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+def STREX  : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn),
                     NoItinerary,
-                    "strex", "\t$success, $src, [$ptr]",
+                    "strex", "\t$Rd, $Rt, [$Rn]",
                     []>;
-def STREXD : AIstrex<0b01, (outs GPR:$success),
-                    (ins GPR:$src, GPR:$src2, GPR:$ptr),
+def STREXD : AIstrex<0b01, (outs GPR:$Rd),
+                    (ins GPR:$Rt, GPR:$Rt2, GPR:$Rn),
                     NoItinerary,
-                    "strexd", "\t$success, $src, $src2, [$ptr]",
+                    "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]",
                     []>;
 }
 
@@ -2630,29 +3363,15 @@ def STREXD : AIstrex<0b01, (outs GPR:$success),
 def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
                 [/* For disassembly only; pattern left blank */]>,
             Requires<[IsARM, HasV7]>  {
-  let Inst{31-20} = 0xf57;
-  let Inst{7-4} = 0b0001;
+  let Inst{31-0} = 0b11110101011111111111000000011111;
 }
 
 // SWP/SWPB are deprecated in V6/V7 and for disassembly only.
 let mayLoad = 1 in {
-def SWP : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary,
-             "swp", "\t$dst, $src, [$ptr]",
-             [/* For disassembly only; pattern left blank */]> {
-  let Inst{27-23} = 0b00010;
-  let Inst{22} = 0; // B = 0
-  let Inst{21-20} = 0b00;
-  let Inst{7-4} = 0b1001;
-}
-
-def SWPB : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary,
-             "swpb", "\t$dst, $src, [$ptr]",
-             [/* For disassembly only; pattern left blank */]> {
-  let Inst{27-23} = 0b00010;
-  let Inst{22} = 1; // B = 1
-  let Inst{21-20} = 0b00;
-  let Inst{7-4} = 0b1001;
-}
+def SWP  : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swp",
+             [/* For disassembly only; pattern left blank */]>;
+def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb",
+             [/* For disassembly only; pattern left blank */]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2660,10 +3379,11 @@ def SWPB : AI<(outs GPR:$dst), (ins GPR:$src, GPR:$ptr), LdStExFrm, NoItinerary,
 //
 
 // __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right, 
+// complete with fixup for the aeabi_read_tp function.
 let isCall = 1,
-  Defs = [R0, R12, LR, CPSR] in {
-  def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br,
-               "bl\t__aeabi_read_tp",
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
+  def TPsoft : PseudoInst<(outs), (ins), IIC_Br,
                [(set R0, ARMthread_pointer)]>;
 }
 
@@ -2680,19 +3400,16 @@ let isCall = 1,
 //   doing so, we also cause the prologue/epilogue code to actively preserve
 //   all of the callee-saved resgisters, which is exactly what we want.
 //   A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
 let Defs =
   [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
     D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
     D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
     D31 ], hasSideEffects = 1, isBarrier = 1 in {
-  def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val),
-                               AddrModeNone, SizeSpecial, IndexModeNone,
-                               Pseudo, NoItinerary,
-                           "add\t$val, pc, #8\t${:comment} eh_setjmp begin\n\t"
-                           "str\t$val, [$src, #+4]\n\t"
-                           "mov\tr0, #0\n\t"
-                           "add\tpc, pc, #0\n\t"
-                           "mov\tr0, #1 ${:comment} eh_setjmp end", "",
+  def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
                          [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
                            Requires<[IsARM, HasVFP2]>;
 }
@@ -2700,14 +3417,8 @@ let Defs =
 let Defs =
   [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
   hasSideEffects = 1, isBarrier = 1 in {
-  def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val),
-                                   AddrModeNone, SizeSpecial, IndexModeNone,
-                                   Pseudo, NoItinerary,
-                           "add\t$val, pc, #8\n ${:comment} eh_setjmp begin\n\t"
-                           "str\t$val, [$src, #+4]\n\t"
-                           "mov\tr0, #0\n\t"
-                           "add\tpc, pc, #0\n\t"
-                           "mov\tr0, #1 ${:comment} eh_setjmp end", "",
+  def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                                   NoItinerary,
                          [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
                                 Requires<[IsARM, NoVFP]>;
 }
@@ -2715,53 +3426,58 @@ let Defs =
 // FIXME: Non-Darwin version(s)
 let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
     Defs = [ R7, LR, SP ] in {
-def Int_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
-                             AddrModeNone, SizeSpecial, IndexModeNone,
-                             Pseudo, NoItinerary,
-                             "ldr\tsp, [$src, #8]\n\t"
-                             "ldr\t$scratch, [$src, #4]\n\t"
-                             "ldr\tr7, [$src]\n\t"
-                             "bx\t$scratch", "",
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+                             NoItinerary,
                          [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
                                 Requires<[IsARM, IsDarwin]>;
 }
 
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are
+// handled when the pseudo is expanded (which happens before any passes
+// that need the instruction size).
+let isBarrier = 1, hasSideEffects = 1 in
+def Int_eh_sjlj_dispatchsetup :
+ PseudoInst<(outs), (ins GPR:$src), NoItinerary,
+            [(ARMeh_sjlj_dispatchsetup GPR:$src)]>,
+              Requires<[IsDarwin]>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
 
 // Large immediate handling.
 
-// Two piece so_imms.
-let isReMaterializable = 1 in
-def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src),
-                         Pseudo, IIC_iMOVi,
-                         "mov", "\t$dst, $src",
-                         [(set GPR:$dst, so_imm2part:$src)]>,
-                  Requires<[IsARM, NoV6T2]>;
-
-def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
-             (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
-                    (so_imm2part_2 imm:$RHS))>;
-def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
-             (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
-                    (so_imm2part_2 imm:$RHS))>;
-def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS),
-             (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
-                    (so_imm2part_2 imm:$RHS))>;
-def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS),
-             (SUBri (SUBri GPR:$LHS, (so_neg_imm2part_1 imm:$RHS)),
-                    (so_neg_imm2part_2 imm:$RHS))>;
-
-// 32-bit immediate using movw + movt.
+// 32-bit immediate using two piece so_imms or movw + movt.
 // This is a single pseudo instruction, the benefit is that it can be remat'd
 // as a single unit instead of having to handle reg inputs.
 // FIXME: Remove this when we can do generalized remat.
-let isReMaterializable = 1 in
-def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
-                   "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
-                     [(set GPR:$dst, (i32 imm:$src))]>,
-               Requires<[IsARM, HasV6T2]>;
+let isReMaterializable = 1, isMoveImm = 1 in
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                           [(set GPR:$dst, (arm_i32imm:$src))]>,
+                           Requires<[IsARM]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2addpc,
+                        [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                             IIC_iMOVix2,
+                        [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+let AddedComplexity = 10 in
+def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2ld,
+                    [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                    Requires<[IsARM, UseMovt]>;
+} // isReMaterializable
 
 // ConstantPool, GlobalAddress, and JumpTable
 def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>,
@@ -2800,11 +3516,15 @@ def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
       Requires<[IsARM, IsDarwin]>;
 
 // zextload i1 -> zextload i8
-def : ARMPat<(zextloadi1 addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(zextloadi1 ldst_so_reg:$addr),    (LDRBrs ldst_so_reg:$addr)>;
 
 // extload -> zextload
-def : ARMPat<(extloadi1  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
-def : ARMPat<(extloadi8  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi1 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi1 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+def : ARMPat<(extloadi8 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi8 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+
 def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
 
 def : ARMPat<(extloadi8  addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
@@ -2889,19 +3609,45 @@ include "ARMInstrNEON.td"
 // Coprocessor Instructions.  For disassembly only.
 //
 
-def CDP : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-            nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-            NoItinerary, "cdp", "\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{4} = 0;
-}
-
-def CDP2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-               nohash_imm:$CRd, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-               NoItinerary, "cdp2\tp$cop, $opc1, cr$CRd, cr$CRn, cr$CRm, $opc2",
+def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
+            c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+            NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+            [/* For disassembly only; pattern left blank */]> {
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
+               c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+               NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
-  let Inst{4} = 0;
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
 }
 
 class ACI<dag oops, dag iops, string opc, string asm>
@@ -3000,110 +3746,164 @@ defm LDC2 : LdStCop<0b1111,    1, "ldc2">;
 defm STC  : LdStCop<{?,?,?,?}, 0, "stc">;
 defm STC2 : LdStCop<0b1111,    0, "stc2">;
 
-def MCR : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-              GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-              NoItinerary, "mcr", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{20} = 0;
-  let Inst{4} = 1;
-}
-
-def MCR2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-                GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-                NoItinerary, "mcr2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2",
-                [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{20} = 0;
-  let Inst{4} = 1;
-}
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register -- for disassembly only
+//
 
-def MRC : ABI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-              GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-              NoItinerary, "mrc", "\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{20} = 1;
+class MovRCopro<string opc, bit direction>
+  : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
+        GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
+        [/* For disassembly only; pattern left blank */]> {
+  let Inst{20} = direction;
   let Inst{4} = 1;
-}
 
-def MRC2 : ABXI<0b1110, (outs), (ins nohash_imm:$cop, i32imm:$opc1,
-                GPR:$Rt, nohash_imm:$CRn, nohash_imm:$CRm, i32imm:$opc2),
-                NoItinerary, "mrc2\tp$cop, $opc1, $Rt, cr$CRn, cr$CRm, $opc2",
-                [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>;
+def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>;
+
+class MovRCopro2<string opc, bit direction>
+  : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
+         GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+         NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+         [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
-  let Inst{20} = 1;
+  let Inst{20} = direction;
   let Inst{4} = 1;
-}
 
-def MCRR : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc,
-               GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm),
-               NoItinerary, "mcrr", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm",
-               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0100;
-}
-
-def MCRR2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc,
-                 GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm),
-                 NoItinerary, "mcrr2\tp$cop, $opc, $Rt, $Rt2, cr$CRm",
-                 [/* For disassembly only; pattern left blank */]> {
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */>;
+def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */>;
+
+class MovRRCopro<string opc, bit direction>
+  : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
+        GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
+        [/* For disassembly only; pattern left blank */]> {
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
+
+class MovRRCopro2<string opc, bit direction>
+  : ABXI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
+         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+         NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
+         [/* For disassembly only; pattern left blank */]> {
   let Inst{31-28} = 0b1111;
-  let Inst{23-20} = 0b0100;
-}
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
 
-def MRRC : ABI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc,
-               GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm),
-               NoItinerary, "mrrc", "\tp$cop, $opc, $Rt, $Rt2, cr$CRm",
-               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0101;
-}
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
 
-def MRRC2 : ABXI<0b1100, (outs), (ins nohash_imm:$cop, i32imm:$opc,
-                 GPR:$Rt, GPR:$Rt2, nohash_imm:$CRm),
-                 NoItinerary, "mrrc2\tp$cop, $opc, $Rt, $Rt2, cr$CRm",
-                 [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-28} = 0b1111;
-  let Inst{23-20} = 0b0101;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
 }
 
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */>;
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
 //===----------------------------------------------------------------------===//
 // Move between special register and ARM core register -- for disassembly only
 //
 
-def MRS : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary, "mrs", "\t$dst, cpsr",
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0000;
+  bits<4> Rd;
+  let Inst{23-16} = 0b00001111;
+  let Inst{15-12} = Rd;
   let Inst{7-4} = 0b0000;
 }
 
-def MRSsys : ABI<0b0001,(outs GPR:$dst),(ins), NoItinerary,"mrs","\t$dst, spsr",
+def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr",
               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0100;
+  bits<4> Rd;
+  let Inst{23-16} = 0b01001111;
+  let Inst{15-12} = Rd;
   let Inst{7-4} = 0b0000;
 }
 
-def MSR : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary,
-              "msr", "\tcpsr$mask, $src",
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
+              "msr", "\t$mask, $Rn",
               [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0010;
-  let Inst{7-4} = 0b0000;
-}
+  bits<5> mask;
+  bits<4> Rn;
 
-def MSRi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary,
-              "msr", "\tcpsr$mask, $a",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0010;
-  let Inst{7-4} = 0b0000;
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rn;
 }
 
-def MSRsys : ABI<0b0001, (outs), (ins GPR:$src, msr_mask:$mask), NoItinerary,
-              "msr", "\tspsr$mask, $src",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0110;
-  let Inst{7-4} = 0b0000;
-}
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
+               "msr", "\t$mask, $a",
+               [/* For disassembly only; pattern left blank */]> {
+  bits<5> mask;
+  bits<12> a;
 
-def MSRsysi : ABI<0b0011, (outs), (ins so_imm:$a, msr_mask:$mask), NoItinerary,
-              "msr", "\tspsr$mask, $a",
-              [/* For disassembly only; pattern left blank */]> {
-  let Inst{23-20} = 0b0110;
-  let Inst{7-4} = 0b0000;
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-0} = a;
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
index 4d2f116..1e2e550 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -16,11 +16,17 @@
 //===----------------------------------------------------------------------===//
 
 def SDTARMVCMP    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def SDTARMVCMPZ   : SDTypeProfile<1, 1, []>;
 
 def NEONvceq      : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvceqz     : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
 def NEONvcge      : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgez     : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
+def NEONvclez     : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
 def NEONvcgeu     : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
 def NEONvcgt      : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtz     : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
+def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
 def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
 def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
 
@@ -69,6 +75,11 @@ def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
 def NEONvmovImm   : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
 def NEONvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
 
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def NEONvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def NEONvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
+
 def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
 
 // VDUPLANE can produce a quad-register result from a double-register source,
@@ -129,830 +140,1506 @@ def nModImm : Operand<i32> {
 // NEON load / store instructions
 //===----------------------------------------------------------------------===//
 
-// Use vldmia to load a Q register as a D register pair.
-// This is equivalent to VLDMD except that it has a Q register operand
-// instead of a pair of D registers.
-def VLDMQ
-  : AXDI4<(outs QPR:$dst), (ins addrmode4:$addr, pred:$p),
-          IndexModeNone, IIC_fpLoadm,
-          "vldm${addr:submode}${p}\t$addr, ${dst:dregpair}", "",
-          [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>;
-
-let mayLoad = 1, neverHasSideEffects = 1 in {
-// Use vld1 to load a Q register as a D register pair.
-// This alternative to VLDMQ allows an alignment to be specified.
-// This is equivalent to VLD1q64 except that it has a Q register operand.
-def VLD1q
-  : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr),
-          IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>;
-} // mayLoad = 1, neverHasSideEffects = 1
-
-// Use vstmia to store a Q register as a D register pair.
-// This is equivalent to VSTMD except that it has a Q register operand
-// instead of a pair of D registers.
-def VSTMQ
-  : AXDI4<(outs), (ins QPR:$src, addrmode4:$addr, pred:$p),
-          IndexModeNone, IIC_fpStorem,
-          "vstm${addr:submode}${p}\t$addr, ${src:dregpair}", "",
-          [(store (v2f64 QPR:$src), addrmode4:$addr)]>;
-
-let mayStore = 1, neverHasSideEffects = 1 in {
-// Use vst1 to store a Q register as a D register pair.
-// This alternative to VSTMQ allows an alignment to be specified.
-// This is equivalent to VST1q64 except that it has a Q register operand.
-def VST1q
-  : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src),
-          IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>;
-} // mayStore = 1, neverHasSideEffects = 1
-
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// Use VLDM to load a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
+def VLDMQIA
+  : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn),
+                    IIC_fpLoad_m, "",
+                   [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>;
+def VLDMQDB
+  : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn),
+                    IIC_fpLoad_m, "",
+                   [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>;
+
+// Use VSTM to store a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
+def VSTMQIA
+  : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn),
+                    IIC_fpStore_m, "",
+                   [(store (v2f64 QPR:$src), GPR:$Rn)]>;
+def VSTMQDB
+  : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn),
+                    IIC_fpStore_m, "",
+                   [(store (v2f64 QPR:$src), GPR:$Rn)]>;
 
 // Classes for VLD* pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
-class VLDQPseudo
-  : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), IIC_VST, "">;
-class VLDQWBPseudo
+class VLDQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset), itin,
                 "$addr.addr = $wb">;
-class VLDQQPseudo
-  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), IIC_VST, "">;
-class VLDQQWBPseudo
+class VLDQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset), itin,
                 "$addr.addr = $wb">;
-class VLDQQQQWBPseudo
+class VLDQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src), itin,"">;
+class VLDQQQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
                 "$addr.addr = $wb, $src = $dst">;
 
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
 //   VLD1     : Vector Load (multiple single elements)
 class VLD1D<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst),
-          (ins addrmode6:$addr), IIC_VLD1,
-          "vld1", Dt, "\\{$dst\\}, $addr", "", []>;
+  : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd),
+          (ins addrmode6:$Rn), IIC_VLD1,
+          "vld1", Dt, "\\{$Vd\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 class VLD1Q<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2),
-          (ins addrmode6:$addr), IIC_VLD1,
-          "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>;
+  : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn), IIC_VLD1x2,
+          "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VLD1d8   : VLD1D<0b0000, "8">;
-def  VLD1d16  : VLD1D<0b0100, "16">;
-def  VLD1d32  : VLD1D<0b1000, "32">;
-def  VLD1d64  : VLD1D<0b1100, "64">;
+def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
+def  VLD1d16  : VLD1D<{0,1,0,?}, "16">;
+def  VLD1d32  : VLD1D<{1,0,0,?}, "32">;
+def  VLD1d64  : VLD1D<{1,1,0,?}, "64">;
 
-def  VLD1q8   : VLD1Q<0b0000, "8">;
-def  VLD1q16  : VLD1Q<0b0100, "16">;
-def  VLD1q32  : VLD1Q<0b1000, "32">;
-def  VLD1q64  : VLD1Q<0b1100, "64">;
+def  VLD1q8   : VLD1Q<{0,0,?,?}, "8">;
+def  VLD1q16  : VLD1Q<{0,1,?,?}, "16">;
+def  VLD1q32  : VLD1Q<{1,0,?,?}, "32">;
+def  VLD1q64  : VLD1Q<{1,1,?,?}, "64">;
 
-def  VLD1q8Pseudo  : VLDQPseudo;
-def  VLD1q16Pseudo : VLDQPseudo;
-def  VLD1q32Pseudo : VLDQPseudo;
-def  VLD1q64Pseudo : VLDQPseudo;
+def  VLD1q8Pseudo  : VLDQPseudo<IIC_VLD1x2>;
+def  VLD1q16Pseudo : VLDQPseudo<IIC_VLD1x2>;
+def  VLD1q32Pseudo : VLDQPseudo<IIC_VLD1x2>;
+def  VLD1q64Pseudo : VLDQPseudo<IIC_VLD1x2>;
 
 // ...with address register writeback:
 class VLD1DWB<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1,
-          "vld1", Dt, "\\{$dst\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+  : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1u,
+          "vld1", Dt, "\\{$Vd\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
 class VLD1QWB<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1,
-          "vld1", Dt, "${dst:dregpair}, $addr$offset",
-          "$addr.addr = $wb", []>;
-
-def VLD1d8_UPD  : VLD1DWB<0b0000, "8">;
-def VLD1d16_UPD : VLD1DWB<0b0100, "16">;
-def VLD1d32_UPD : VLD1DWB<0b1000, "32">;
-def VLD1d64_UPD : VLD1DWB<0b1100, "64">;
-
-def VLD1q8_UPD  : VLD1QWB<0b0000, "8">;
-def VLD1q16_UPD : VLD1QWB<0b0100, "16">;
-def VLD1q32_UPD : VLD1QWB<0b1000, "32">;
-def VLD1q64_UPD : VLD1QWB<0b1100, "64">;
-
-def VLD1q8Pseudo_UPD  : VLDQWBPseudo;
-def VLD1q16Pseudo_UPD : VLDQWBPseudo;
-def VLD1q32Pseudo_UPD : VLDQWBPseudo;
-def VLD1q64Pseudo_UPD : VLDQWBPseudo;
+  : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x2u,
+          "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VLD1d8_UPD  : VLD1DWB<{0,0,0,?}, "8">;
+def VLD1d16_UPD : VLD1DWB<{0,1,0,?}, "16">;
+def VLD1d32_UPD : VLD1DWB<{1,0,0,?}, "32">;
+def VLD1d64_UPD : VLD1DWB<{1,1,0,?}, "64">;
+
+def VLD1q8_UPD  : VLD1QWB<{0,0,?,?}, "8">;
+def VLD1q16_UPD : VLD1QWB<{0,1,?,?}, "16">;
+def VLD1q32_UPD : VLD1QWB<{1,0,?,?}, "32">;
+def VLD1q64_UPD : VLD1QWB<{1,1,?,?}, "64">;
+
+def VLD1q8Pseudo_UPD  : VLDQWBPseudo<IIC_VLD1x2u>;
+def VLD1q16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>;
+def VLD1q32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>;
+def VLD1q64Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>;
 
 // ...with 3 registers (some of these are only for the disassembler):
 class VLD1D3<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
-          (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt,
-          "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>;
+  : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt,
+          "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 class VLD1D3WB<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt,
-          "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>;
+  : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x3u, "vld1", Dt,
+          "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
 
-def VLD1d8T      : VLD1D3<0b0000, "8">;
-def VLD1d16T     : VLD1D3<0b0100, "16">;
-def VLD1d32T     : VLD1D3<0b1000, "32">;
-def VLD1d64T     : VLD1D3<0b1100, "64">;
+def VLD1d8T      : VLD1D3<{0,0,0,?}, "8">;
+def VLD1d16T     : VLD1D3<{0,1,0,?}, "16">;
+def VLD1d32T     : VLD1D3<{1,0,0,?}, "32">;
+def VLD1d64T     : VLD1D3<{1,1,0,?}, "64">;
 
-def VLD1d8T_UPD  : VLD1D3WB<0b0000, "8">;
-def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">;
-def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">;
-def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">;
+def VLD1d8T_UPD  : VLD1D3WB<{0,0,0,?}, "8">;
+def VLD1d16T_UPD : VLD1D3WB<{0,1,0,?}, "16">;
+def VLD1d32T_UPD : VLD1D3WB<{1,0,0,?}, "32">;
+def VLD1d64T_UPD : VLD1D3WB<{1,1,0,?}, "64">;
 
-def VLD1d64TPseudo     : VLDQQPseudo;
-def VLD1d64TPseudo_UPD : VLDQQWBPseudo;
+def VLD1d64TPseudo     : VLDQQPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x3u>;
 
 // ...with 4 registers (some of these are only for the disassembler):
 class VLD1D4<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
-          (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt,
-          "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>;
+  : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt,
+          "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 class VLD1D4WB<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b0010,op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt,
-          "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb",
-          []>;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x4u, "vld1", Dt,
+          "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb",
+          []> {
+  let Inst{5-4} = Rn{5-4};
+}
 
-def VLD1d8Q      : VLD1D4<0b0000, "8">;
-def VLD1d16Q     : VLD1D4<0b0100, "16">;
-def VLD1d32Q     : VLD1D4<0b1000, "32">;
-def VLD1d64Q     : VLD1D4<0b1100, "64">;
+def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8">;
+def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16">;
+def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32">;
+def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64">;
 
-def VLD1d8Q_UPD  : VLD1D4WB<0b0000, "8">;
-def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">;
-def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">;
-def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">;
+def VLD1d8Q_UPD  : VLD1D4WB<{0,0,?,?}, "8">;
+def VLD1d16Q_UPD : VLD1D4WB<{0,1,?,?}, "16">;
+def VLD1d32Q_UPD : VLD1D4WB<{1,0,?,?}, "32">;
+def VLD1d64Q_UPD : VLD1D4WB<{1,1,?,?}, "64">;
 
-def VLD1d64QPseudo     : VLDQQPseudo;
-def VLD1d64QPseudo_UPD : VLDQQWBPseudo;
+def VLD1d64QPseudo     : VLDQQPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x4u>;
 
 //   VLD2     : Vector Load (multiple 2-element structures)
 class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2),
-          (ins addrmode6:$addr), IIC_VLD2,
-          "vld2", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>;
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn), IIC_VLD2,
+          "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 class VLD2Q<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, 0b0011, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
-          (ins addrmode6:$addr), IIC_VLD2,
-          "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD2x2,
+          "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VLD2d8   : VLD2D<0b1000, 0b0000, "8">;
-def  VLD2d16  : VLD2D<0b1000, 0b0100, "16">;
-def  VLD2d32  : VLD2D<0b1000, 0b1000, "32">;
+def  VLD2d8   : VLD2D<0b1000, {0,0,?,?}, "8">;
+def  VLD2d16  : VLD2D<0b1000, {0,1,?,?}, "16">;
+def  VLD2d32  : VLD2D<0b1000, {1,0,?,?}, "32">;
 
-def  VLD2q8   : VLD2Q<0b0000, "8">;
-def  VLD2q16  : VLD2Q<0b0100, "16">;
-def  VLD2q32  : VLD2Q<0b1000, "32">;
+def  VLD2q8   : VLD2Q<{0,0,?,?}, "8">;
+def  VLD2q16  : VLD2Q<{0,1,?,?}, "16">;
+def  VLD2q32  : VLD2Q<{1,0,?,?}, "32">;
 
-def  VLD2d8Pseudo  : VLDQPseudo;
-def  VLD2d16Pseudo : VLDQPseudo;
-def  VLD2d32Pseudo : VLDQPseudo;
+def  VLD2d8Pseudo  : VLDQPseudo<IIC_VLD2>;
+def  VLD2d16Pseudo : VLDQPseudo<IIC_VLD2>;
+def  VLD2d32Pseudo : VLDQPseudo<IIC_VLD2>;
 
-def  VLD2q8Pseudo  : VLDQQPseudo;
-def  VLD2q16Pseudo : VLDQQPseudo;
-def  VLD2q32Pseudo : VLDQQPseudo;
+def  VLD2q8Pseudo  : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
 
 // ...with address register writeback:
 class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2,
-          "vld2", Dt, "\\{$dst1, $dst2\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2u,
+          "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 class VLD2QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, 0b0011, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD2,
-          "vld2", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2x2u,
+          "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 
-def VLD2d8_UPD  : VLD2DWB<0b1000, 0b0000, "8">;
-def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">;
-def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">;
+def VLD2d8_UPD  : VLD2DWB<0b1000, {0,0,?,?}, "8">;
+def VLD2d16_UPD : VLD2DWB<0b1000, {0,1,?,?}, "16">;
+def VLD2d32_UPD : VLD2DWB<0b1000, {1,0,?,?}, "32">;
 
-def VLD2q8_UPD  : VLD2QWB<0b0000, "8">;
-def VLD2q16_UPD : VLD2QWB<0b0100, "16">;
-def VLD2q32_UPD : VLD2QWB<0b1000, "32">;
+def VLD2q8_UPD  : VLD2QWB<{0,0,?,?}, "8">;
+def VLD2q16_UPD : VLD2QWB<{0,1,?,?}, "16">;
+def VLD2q32_UPD : VLD2QWB<{1,0,?,?}, "32">;
 
-def VLD2d8Pseudo_UPD  : VLDQWBPseudo;
-def VLD2d16Pseudo_UPD : VLDQWBPseudo;
-def VLD2d32Pseudo_UPD : VLDQWBPseudo;
+def VLD2d8Pseudo_UPD  : VLDQWBPseudo<IIC_VLD2u>;
+def VLD2d16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>;
+def VLD2d32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>;
 
-def VLD2q8Pseudo_UPD  : VLDQQWBPseudo;
-def VLD2q16Pseudo_UPD : VLDQQWBPseudo;
-def VLD2q32Pseudo_UPD : VLDQQWBPseudo;
+def VLD2q8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD2x2u>;
+def VLD2q16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>;
+def VLD2q32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>;
 
 // ...with double-spaced registers (for disassembly only):
-def VLD2b8      : VLD2D<0b1001, 0b0000, "8">;
-def VLD2b16     : VLD2D<0b1001, 0b0100, "16">;
-def VLD2b32     : VLD2D<0b1001, 0b1000, "32">;
-def VLD2b8_UPD  : VLD2DWB<0b1001, 0b0000, "8">;
-def VLD2b16_UPD : VLD2DWB<0b1001, 0b0100, "16">;
-def VLD2b32_UPD : VLD2DWB<0b1001, 0b1000, "32">;
+def VLD2b8      : VLD2D<0b1001, {0,0,?,?}, "8">;
+def VLD2b16     : VLD2D<0b1001, {0,1,?,?}, "16">;
+def VLD2b32     : VLD2D<0b1001, {1,0,?,?}, "32">;
+def VLD2b8_UPD  : VLD2DWB<0b1001, {0,0,?,?}, "8">;
+def VLD2b16_UPD : VLD2DWB<0b1001, {0,1,?,?}, "16">;
+def VLD2b32_UPD : VLD2DWB<0b1001, {1,0,?,?}, "32">;
 
 //   VLD3     : Vector Load (multiple 3-element structures)
 class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
-          (ins addrmode6:$addr), IIC_VLD3,
-          "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>;
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD3,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 
-def  VLD3d8   : VLD3D<0b0100, 0b0000, "8">;
-def  VLD3d16  : VLD3D<0b0100, 0b0100, "16">;
-def  VLD3d32  : VLD3D<0b0100, 0b1000, "32">;
+def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
+def  VLD3d16  : VLD3D<0b0100, {0,1,0,?}, "16">;
+def  VLD3d32  : VLD3D<0b0100, {1,0,0,?}, "32">;
 
-def  VLD3d8Pseudo  : VLDQQPseudo;
-def  VLD3d16Pseudo : VLDQQPseudo;
-def  VLD3d32Pseudo : VLDQQPseudo;
+def  VLD3d8Pseudo  : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
 
 // ...with address register writeback:
 class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD3,
-          "vld3", Dt, "\\{$dst1, $dst2, $dst3\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
-
-def VLD3d8_UPD  : VLD3DWB<0b0100, 0b0000, "8">;
-def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">;
-def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">;
-
-def VLD3d8Pseudo_UPD  : VLDQQWBPseudo;
-def VLD3d16Pseudo_UPD : VLDQQWBPseudo;
-def VLD3d32Pseudo_UPD : VLDQQWBPseudo;
-
-// ...with double-spaced registers (non-updating versions for disassembly only):
-def VLD3q8      : VLD3D<0b0101, 0b0000, "8">;
-def VLD3q16     : VLD3D<0b0101, 0b0100, "16">;
-def VLD3q32     : VLD3D<0b0101, 0b1000, "32">;
-def VLD3q8_UPD  : VLD3DWB<0b0101, 0b0000, "8">;
-def VLD3q16_UPD : VLD3DWB<0b0101, 0b0100, "16">;
-def VLD3q32_UPD : VLD3DWB<0b0101, 0b1000, "32">;
-
-def VLD3q8Pseudo_UPD  : VLDQQQQWBPseudo;
-def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo;
-def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
+def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
+def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+
+// ...with double-spaced registers:
+def VLD3q8      : VLD3D<0b0101, {0,0,0,?}, "8">;
+def VLD3q16     : VLD3D<0b0101, {0,1,0,?}, "16">;
+def VLD3q32     : VLD3D<0b0101, {1,0,0,?}, "32">;
+def VLD3q8_UPD  : VLD3DWB<0b0101, {0,0,0,?}, "8">;
+def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
+def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VLD3q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
 
 // ...alternate versions to be allocated odd register numbers:
-def VLD3q8oddPseudo_UPD  : VLDQQQQWBPseudo;
-def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo;
-def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo;
+def VLD3q8oddPseudo   : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q16oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q32oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+
+def VLD3q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
 
 //   VLD4     : Vector Load (multiple 4-element structures)
 class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
-          (ins addrmode6:$addr), IIC_VLD4,
-          "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD4,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VLD4d8   : VLD4D<0b0000, 0b0000, "8">;
-def  VLD4d16  : VLD4D<0b0000, 0b0100, "16">;
-def  VLD4d32  : VLD4D<0b0000, 0b1000, "32">;
+def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
+def  VLD4d16  : VLD4D<0b0000, {0,1,?,?}, "16">;
+def  VLD4d32  : VLD4D<0b0000, {1,0,?,?}, "32">;
 
-def  VLD4d8Pseudo  : VLDQQPseudo;
-def  VLD4d16Pseudo : VLDQQPseudo;
-def  VLD4d32Pseudo : VLDQQPseudo;
+def  VLD4d8Pseudo  : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
 
 // ...with address register writeback:
 class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset), IIC_VLD4,
-          "vld4", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
-
-def VLD4d8_UPD  : VLD4DWB<0b0000, 0b0000, "8">;
-def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">;
-def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">;
-
-def VLD4d8Pseudo_UPD  : VLDQQWBPseudo;
-def VLD4d16Pseudo_UPD : VLDQQWBPseudo;
-def VLD4d32Pseudo_UPD : VLDQQWBPseudo;
-
-// ...with double-spaced registers (non-updating versions for disassembly only):
-def VLD4q8      : VLD4D<0b0001, 0b0000, "8">;
-def VLD4q16     : VLD4D<0b0001, 0b0100, "16">;
-def VLD4q32     : VLD4D<0b0001, 0b1000, "32">;
-def VLD4q8_UPD  : VLD4DWB<0b0001, 0b0000, "8">;
-def VLD4q16_UPD : VLD4DWB<0b0001, 0b0100, "16">;
-def VLD4q32_UPD : VLD4DWB<0b0001, 0b1000, "32">;
-
-def VLD4q8Pseudo_UPD  : VLDQQQQWBPseudo;
-def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo;
-def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo;
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
+def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
+def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+
+// ...with double-spaced registers:
+def VLD4q8      : VLD4D<0b0001, {0,0,?,?}, "8">;
+def VLD4q16     : VLD4D<0b0001, {0,1,?,?}, "16">;
+def VLD4q32     : VLD4D<0b0001, {1,0,?,?}, "32">;
+def VLD4q8_UPD  : VLD4DWB<0b0001, {0,0,?,?}, "8">;
+def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
+def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VLD4q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
 
 // ...alternate versions to be allocated odd register numbers:
-def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo;
-def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo;
-def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo;
+def VLD4q8oddPseudo   : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q16oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q32oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+
+def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+// Classes for VLD*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst),
+                (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst),
+                (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst),
+                (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
 
 //   VLD1LN   : Vector Load (single element to one lane)
-//   FIXME: Not yet implemented.
+class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+}
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+  let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
+                                               (i32 (LoadOp addrmode6:$addr)),
+                                               imm:$lane))];
+}
+
+def VLD1LNd8  : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{4};
+  let Inst{4} = Rn{4};
+}
+
+def VLD1LNq8Pseudo  : VLD1QLNPseudo<v16i8, extloadi8>;
+def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
+def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+
+def : Pat<(vector_insert (v2f32 DPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4f32 QPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$src = $Vd, $Rn.addr = $wb", []>;
+
+def VLD1LNd8_UPD  : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{4};
+  let Inst{4} = Rn{4};
+}
+
+def VLD1LNq8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
 
 //   VLD2LN   : Vector Load (single 2-element structure to one lane)
 class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
-          IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr",
-          "$src1 = $dst1, $src2 = $dst2", []>;
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+          IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+}
 
-def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8">;
-def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">;
-def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">;
+def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo  : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
 
 // ...with double-spaced registers:
-def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">;
-def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">;
+def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">;
-def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">;
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
 
 // ...with address register writeback:
 class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt,
-          "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset",
-          "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>;
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt,
+          "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+}
 
-def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8">;
-def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">;
-def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">;
+def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
 
-def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">;
-def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">;
+def VLD2LNd8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+
+def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
 
 //   VLD3LN   : Vector Load (single 3-element structure to one lane)
 class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
-          nohash_imm:$lane), IIC_VLD3, "vld3", Dt,
-          "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr",
-          "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>;
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
+          nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+  let Rm = 0b1111;
+}
 
-def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8">;
-def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">;
-def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">;
+def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
 
 // ...with double-spaced registers:
-def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">;
-def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">;
+def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">;
-def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">;
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
 
 // ...with address register writeback:
 class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
            DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
-          IIC_VLD3, "vld3", Dt,
-          "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr$offset",
-          "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb",
+          IIC_VLD3lnu, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
           []>;
 
-def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8">;
-def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">;
-def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">;
+def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">;
-def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">;
+def VLD3LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+
+def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
 
 //   VLD4LN   : Vector Load (single 4-element structure to one lane)
 class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
-          nohash_imm:$lane), IIC_VLD4, "vld4", Dt,
-          "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr",
-          "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>;
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+          nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+}
 
-def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8">;
-def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">;
-def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">;
+def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
 
 // ...with double-spaced registers:
-def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">;
-def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">;
+def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">;
-def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">;
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
 
 // ...with address register writeback:
 class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b10, op11_8, op7_4,
-          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
            DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
-          IIC_VLD4, "vld4", Dt,
-"\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr$offset",
-"$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb",
-          []>;
+          IIC_VLD4lnu, "vld4", Dt,
+"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm",
+"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
+          []> {
+  let Inst{4}   = Rn{4};
+}
 
-def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8">;
-def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">;
-def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">;
+def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
 
-def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">;
-def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">;
+def VLD4LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+
+def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 //   VLD1DUP  : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd), (ins addrmode6dup:$Rn),
+          IIC_VLD1dup, "vld1", Dt, "\\{$Vd[]\\}, $Rn", "",
+          [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> {
+  let Pattern = [(set QPR:$dst,
+                      (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$addr)))))];
+}
+
+def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>;
+
+def VLD1DUPq8Pseudo  : VLD1QDUPPseudo<v16i8, extloadi8>;
+def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>;
+def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>;
+
+def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPd32 addrmode6:$addr)>;
+def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPq32Pseudo addrmode6:$addr)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+class VLD1QDUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6dup:$Rn), IIC_VLD1dup,
+          "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8">;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16">;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD1DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
+          "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+class VLD1QDUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
+          "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD1DUPd8_UPD  : VLD1DUPWB<{0,0,0,0}, "8">;
+def VLD1DUPd16_UPD : VLD1DUPWB<{0,1,0,?}, "16">;
+def VLD1DUPd32_UPD : VLD1DUPWB<{1,0,0,?}, "32">;
+
+def VLD1DUPq8_UPD  : VLD1QDUPWB<{0,0,1,0}, "8">;
+def VLD1DUPq16_UPD : VLD1QDUPWB<{0,1,1,?}, "16">;
+def VLD1DUPq32_UPD : VLD1QDUPWB<{1,0,1,?}, "32">;
+
+def VLD1DUPq8Pseudo_UPD  : VLDQWBPseudo<IIC_VLD1dupu>;
+def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
+def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>;
+
 //   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6dup:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8">;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">;
+
+def VLD2DUPd8Pseudo  : VLDQPseudo<IIC_VLD2dup>;
+def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>;
+def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8">;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16">;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD2DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD2dupu,
+          "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD2DUPd8_UPD  : VLD2DUPWB<{0,0,0,0}, "8">;
+def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">;
+def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">;
+
+def VLD2DUPd8x2_UPD  : VLD2DUPWB<{0,0,1,0}, "8">;
+def VLD2DUPd16x2_UPD : VLD2DUPWB<{0,1,1,?}, "16">;
+def VLD2DUPd32x2_UPD : VLD2DUPWB<{1,0,1,?}, "32">;
+
+def VLD2DUPd8Pseudo_UPD  : VLDQWBPseudo<IIC_VLD2dupu>;
+def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
+def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>;
+
 //   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+class VLD3DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6dup:$Rn), IIC_VLD3dup,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+def VLD3DUPd8Pseudo  : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPd8x2  : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPd16x2 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPd32x2 : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD3DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8">;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">;
+
+def VLD3DUPd8x2_UPD  : VLD3DUPWB<{0,0,1,0}, "8">;
+def VLD3DUPd16x2_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
+def VLD3DUPd32x2_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
+
+def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
 //   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
-//   FIXME: Not yet implemented.
+class VLD4DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6dup:$Rn), IIC_VLD4dup,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">;
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo  : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPd8x2  : VLD4DUP<{0,0,1,?}, "8">;
+def VLD4DUPd16x2 : VLD4DUP<{0,1,1,?}, "16">;
+def VLD4DUPd32x2 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">;
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">;
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8x2_UPD  : VLD4DUPWB<{0,0,1,0}, "8">;
+def VLD4DUPd16x2_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
+def VLD4DUPd32x2_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 
 // Classes for VST* pseudo-instructions with multi-register operands.
 // These are expanded to real instructions after register allocation.
-class VSTQPseudo
-  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, "">;
-class VSTQWBPseudo
+class VSTQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">;
+class VSTQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin,
                 "$addr.addr = $wb">;
-class VSTQQPseudo
-  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">;
-class VSTQQWBPseudo
+class VSTQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">;
+class VSTQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin,
                 "$addr.addr = $wb">;
-class VSTQQQQWBPseudo
+class VSTQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">;
+class VSTQQQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs GPR:$wb),
-                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST,
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
                 "$addr.addr = $wb">;
 
 //   VST1     : Vector Store (multiple single elements)
 class VST1D<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
-          "vst1", Dt, "\\{$src\\}, $addr", "", []>;
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd),
+          IIC_VST1, "vst1", Dt, "\\{$Vd\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 class VST1Q<bits<4> op7_4, string Dt>
   : NLdSt<0,0b00,0b1010,op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
-          "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), IIC_VST1x2,
+          "vst1", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VST1d8   : VST1D<0b0000, "8">;
-def  VST1d16  : VST1D<0b0100, "16">;
-def  VST1d32  : VST1D<0b1000, "32">;
-def  VST1d64  : VST1D<0b1100, "64">;
+def  VST1d8   : VST1D<{0,0,0,?}, "8">;
+def  VST1d16  : VST1D<{0,1,0,?}, "16">;
+def  VST1d32  : VST1D<{1,0,0,?}, "32">;
+def  VST1d64  : VST1D<{1,1,0,?}, "64">;
 
-def  VST1q8   : VST1Q<0b0000, "8">;
-def  VST1q16  : VST1Q<0b0100, "16">;
-def  VST1q32  : VST1Q<0b1000, "32">;
-def  VST1q64  : VST1Q<0b1100, "64">;
+def  VST1q8   : VST1Q<{0,0,?,?}, "8">;
+def  VST1q16  : VST1Q<{0,1,?,?}, "16">;
+def  VST1q32  : VST1Q<{1,0,?,?}, "32">;
+def  VST1q64  : VST1Q<{1,1,?,?}, "64">;
 
-def  VST1q8Pseudo  : VSTQPseudo;
-def  VST1q16Pseudo : VSTQPseudo;
-def  VST1q32Pseudo : VSTQPseudo;
-def  VST1q64Pseudo : VSTQPseudo;
+def  VST1q8Pseudo  : VSTQPseudo<IIC_VST1x2>;
+def  VST1q16Pseudo : VSTQPseudo<IIC_VST1x2>;
+def  VST1q32Pseudo : VSTQPseudo<IIC_VST1x2>;
+def  VST1q64Pseudo : VSTQPseudo<IIC_VST1x2>;
 
 // ...with address register writeback:
 class VST1DWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset, DPR:$src), IIC_VST,
-          "vst1", Dt, "\\{$src\\}, $addr$offset", "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd), IIC_VST1u,
+          "vst1", Dt, "\\{$Vd\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
 class VST1QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST,
-          "vst1", Dt, "${src:dregpair}, $addr$offset", "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2),
+          IIC_VST1x2u, "vst1", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 
-def VST1d8_UPD  : VST1DWB<0b0000, "8">;
-def VST1d16_UPD : VST1DWB<0b0100, "16">;
-def VST1d32_UPD : VST1DWB<0b1000, "32">;
-def VST1d64_UPD : VST1DWB<0b1100, "64">;
+def VST1d8_UPD  : VST1DWB<{0,0,0,?}, "8">;
+def VST1d16_UPD : VST1DWB<{0,1,0,?}, "16">;
+def VST1d32_UPD : VST1DWB<{1,0,0,?}, "32">;
+def VST1d64_UPD : VST1DWB<{1,1,0,?}, "64">;
 
-def VST1q8_UPD  : VST1QWB<0b0000, "8">;
-def VST1q16_UPD : VST1QWB<0b0100, "16">;
-def VST1q32_UPD : VST1QWB<0b1000, "32">;
-def VST1q64_UPD : VST1QWB<0b1100, "64">;
+def VST1q8_UPD  : VST1QWB<{0,0,?,?}, "8">;
+def VST1q16_UPD : VST1QWB<{0,1,?,?}, "16">;
+def VST1q32_UPD : VST1QWB<{1,0,?,?}, "32">;
+def VST1q64_UPD : VST1QWB<{1,1,?,?}, "64">;
 
-def VST1q8Pseudo_UPD  : VSTQWBPseudo;
-def VST1q16Pseudo_UPD : VSTQWBPseudo;
-def VST1q32Pseudo_UPD : VSTQWBPseudo;
-def VST1q64Pseudo_UPD : VSTQWBPseudo;
+def VST1q8Pseudo_UPD  : VSTQWBPseudo<IIC_VST1x2u>;
+def VST1q16Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>;
+def VST1q32Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>;
+def VST1q64Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>;
 
 // ...with 3 registers (some of these are only for the disassembler):
 class VST1D3<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0110, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3),
-          IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3),
+          IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 class VST1D3WB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3),
-          IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3),
+          IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
 
-def VST1d8T      : VST1D3<0b0000, "8">;
-def VST1d16T     : VST1D3<0b0100, "16">;
-def VST1d32T     : VST1D3<0b1000, "32">;
-def VST1d64T     : VST1D3<0b1100, "64">;
+def VST1d8T      : VST1D3<{0,0,0,?}, "8">;
+def VST1d16T     : VST1D3<{0,1,0,?}, "16">;
+def VST1d32T     : VST1D3<{1,0,0,?}, "32">;
+def VST1d64T     : VST1D3<{1,1,0,?}, "64">;
 
-def VST1d8T_UPD  : VST1D3WB<0b0000, "8">;
-def VST1d16T_UPD : VST1D3WB<0b0100, "16">;
-def VST1d32T_UPD : VST1D3WB<0b1000, "32">;
-def VST1d64T_UPD : VST1D3WB<0b1100, "64">;
+def VST1d8T_UPD  : VST1D3WB<{0,0,0,?}, "8">;
+def VST1d16T_UPD : VST1D3WB<{0,1,0,?}, "16">;
+def VST1d32T_UPD : VST1D3WB<{1,0,0,?}, "32">;
+def VST1d64T_UPD : VST1D3WB<{1,1,0,?}, "64">;
 
-def VST1d64TPseudo     : VSTQQPseudo;
-def VST1d64TPseudo_UPD : VSTQQWBPseudo;
+def VST1d64TPseudo     : VSTQQPseudo<IIC_VST1x3>;
+def VST1d64TPseudo_UPD : VSTQQWBPseudo<IIC_VST1x3u>;
 
 // ...with 4 registers (some of these are only for the disassembler):
 class VST1D4<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0010, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
-          IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "",
-          []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST1x4, "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", "",
+          []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 class VST1D4WB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
-          IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST1x4u,
+          "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 
-def VST1d8Q      : VST1D4<0b0000, "8">;
-def VST1d16Q     : VST1D4<0b0100, "16">;
-def VST1d32Q     : VST1D4<0b1000, "32">;
-def VST1d64Q     : VST1D4<0b1100, "64">;
+def VST1d8Q      : VST1D4<{0,0,?,?}, "8">;
+def VST1d16Q     : VST1D4<{0,1,?,?}, "16">;
+def VST1d32Q     : VST1D4<{1,0,?,?}, "32">;
+def VST1d64Q     : VST1D4<{1,1,?,?}, "64">;
 
-def VST1d8Q_UPD  : VST1D4WB<0b0000, "8">;
-def VST1d16Q_UPD : VST1D4WB<0b0100, "16">;
-def VST1d32Q_UPD : VST1D4WB<0b1000, "32">;
-def VST1d64Q_UPD : VST1D4WB<0b1100, "64">;
+def VST1d8Q_UPD  : VST1D4WB<{0,0,?,?}, "8">;
+def VST1d16Q_UPD : VST1D4WB<{0,1,?,?}, "16">;
+def VST1d32Q_UPD : VST1D4WB<{1,0,?,?}, "32">;
+def VST1d64Q_UPD : VST1D4WB<{1,1,?,?}, "64">;
 
-def VST1d64QPseudo     : VSTQQPseudo;
-def VST1d64QPseudo_UPD : VSTQQWBPseudo;
+def VST1d64QPseudo     : VSTQQPseudo<IIC_VST1x4>;
+def VST1d64QPseudo_UPD : VSTQQWBPseudo<IIC_VST1x4u>;
 
 //   VST2     : Vector Store (multiple 2-element structures)
 class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2),
-          IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr", "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2),
+          IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 class VST2Q<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0011, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
-          IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr",
-          "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST2x2, "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VST2d8   : VST2D<0b1000, 0b0000, "8">;
-def  VST2d16  : VST2D<0b1000, 0b0100, "16">;
-def  VST2d32  : VST2D<0b1000, 0b1000, "32">;
+def  VST2d8   : VST2D<0b1000, {0,0,?,?}, "8">;
+def  VST2d16  : VST2D<0b1000, {0,1,?,?}, "16">;
+def  VST2d32  : VST2D<0b1000, {1,0,?,?}, "32">;
 
-def  VST2q8   : VST2Q<0b0000, "8">;
-def  VST2q16  : VST2Q<0b0100, "16">;
-def  VST2q32  : VST2Q<0b1000, "32">;
+def  VST2q8   : VST2Q<{0,0,?,?}, "8">;
+def  VST2q16  : VST2Q<{0,1,?,?}, "16">;
+def  VST2q32  : VST2Q<{1,0,?,?}, "32">;
 
-def  VST2d8Pseudo  : VSTQPseudo;
-def  VST2d16Pseudo : VSTQPseudo;
-def  VST2d32Pseudo : VSTQPseudo;
+def  VST2d8Pseudo  : VSTQPseudo<IIC_VST2>;
+def  VST2d16Pseudo : VSTQPseudo<IIC_VST2>;
+def  VST2d32Pseudo : VSTQPseudo<IIC_VST2>;
 
-def  VST2q8Pseudo  : VSTQQPseudo;
-def  VST2q16Pseudo : VSTQQPseudo;
-def  VST2q32Pseudo : VSTQQPseudo;
+def  VST2q8Pseudo  : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
 
 // ...with address register writeback:
 class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2),
-          IIC_VST, "vst2", Dt, "\\{$src1, $src2\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2),
+          IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 class VST2QWB<bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
-          IIC_VST, "vst2", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST2x2u,
+          "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
 
-def VST2d8_UPD  : VST2DWB<0b1000, 0b0000, "8">;
-def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">;
-def VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">;
+def VST2d8_UPD  : VST2DWB<0b1000, {0,0,?,?}, "8">;
+def VST2d16_UPD : VST2DWB<0b1000, {0,1,?,?}, "16">;
+def VST2d32_UPD : VST2DWB<0b1000, {1,0,?,?}, "32">;
 
-def VST2q8_UPD  : VST2QWB<0b0000, "8">;
-def VST2q16_UPD : VST2QWB<0b0100, "16">;
-def VST2q32_UPD : VST2QWB<0b1000, "32">;
+def VST2q8_UPD  : VST2QWB<{0,0,?,?}, "8">;
+def VST2q16_UPD : VST2QWB<{0,1,?,?}, "16">;
+def VST2q32_UPD : VST2QWB<{1,0,?,?}, "32">;
 
-def VST2d8Pseudo_UPD  : VSTQWBPseudo;
-def VST2d16Pseudo_UPD : VSTQWBPseudo;
-def VST2d32Pseudo_UPD : VSTQWBPseudo;
+def VST2d8Pseudo_UPD  : VSTQWBPseudo<IIC_VST2u>;
+def VST2d16Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>;
+def VST2d32Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>;
 
-def VST2q8Pseudo_UPD  : VSTQQWBPseudo;
-def VST2q16Pseudo_UPD : VSTQQWBPseudo;
-def VST2q32Pseudo_UPD : VSTQQWBPseudo;
+def VST2q8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST2x2u>;
+def VST2q16Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>;
+def VST2q32Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>;
 
 // ...with double-spaced registers (for disassembly only):
-def VST2b8      : VST2D<0b1001, 0b0000, "8">;
-def VST2b16     : VST2D<0b1001, 0b0100, "16">;
-def VST2b32     : VST2D<0b1001, 0b1000, "32">;
-def VST2b8_UPD  : VST2DWB<0b1001, 0b0000, "8">;
-def VST2b16_UPD : VST2DWB<0b1001, 0b0100, "16">;
-def VST2b32_UPD : VST2DWB<0b1001, 0b1000, "32">;
+def VST2b8      : VST2D<0b1001, {0,0,?,?}, "8">;
+def VST2b16     : VST2D<0b1001, {0,1,?,?}, "16">;
+def VST2b32     : VST2D<0b1001, {1,0,?,?}, "32">;
+def VST2b8_UPD  : VST2DWB<0b1001, {0,0,?,?}, "8">;
+def VST2b16_UPD : VST2DWB<0b1001, {0,1,?,?}, "16">;
+def VST2b32_UPD : VST2DWB<0b1001, {1,0,?,?}, "32">;
 
 //   VST3     : Vector Store (multiple 3-element structures)
 class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
-          "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
 
-def  VST3d8   : VST3D<0b0100, 0b0000, "8">;
-def  VST3d16  : VST3D<0b0100, 0b0100, "16">;
-def  VST3d32  : VST3D<0b0100, 0b1000, "32">;
+def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
+def  VST3d16  : VST3D<0b0100, {0,1,0,?}, "16">;
+def  VST3d32  : VST3D<0b0100, {1,0,0,?}, "32">;
 
-def  VST3d8Pseudo  : VSTQQPseudo;
-def  VST3d16Pseudo : VSTQQPseudo;
-def  VST3d32Pseudo : VSTQQPseudo;
+def  VST3d8Pseudo  : VSTQQPseudo<IIC_VST3>;
+def  VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
+def  VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
 
 // ...with address register writeback:
 class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
-          "vst3", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
-
-def VST3d8_UPD  : VST3DWB<0b0100, 0b0000, "8">;
-def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">;
-def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">;
-
-def VST3d8Pseudo_UPD  : VSTQQWBPseudo;
-def VST3d16Pseudo_UPD : VSTQQWBPseudo;
-def VST3d32Pseudo_UPD : VSTQQWBPseudo;
-
-// ...with double-spaced registers (non-updating versions for disassembly only):
-def VST3q8      : VST3D<0b0101, 0b0000, "8">;
-def VST3q16     : VST3D<0b0101, 0b0100, "16">;
-def VST3q32     : VST3D<0b0101, 0b1000, "32">;
-def VST3q8_UPD  : VST3DWB<0b0101, 0b0000, "8">;
-def VST3q16_UPD : VST3DWB<0b0101, 0b0100, "16">;
-def VST3q32_UPD : VST3DWB<0b0101, 0b1000, "32">;
-
-def VST3q8Pseudo_UPD  : VSTQQQQWBPseudo;
-def VST3q16Pseudo_UPD : VSTQQQQWBPseudo;
-def VST3q32Pseudo_UPD : VSTQQQQWBPseudo;
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
+
+def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
+def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
+def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VST3d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+
+// ...with double-spaced registers:
+def VST3q8      : VST3D<0b0101, {0,0,0,?}, "8">;
+def VST3q16     : VST3D<0b0101, {0,1,0,?}, "16">;
+def VST3q32     : VST3D<0b0101, {1,0,0,?}, "32">;
+def VST3q8_UPD  : VST3DWB<0b0101, {0,0,0,?}, "8">;
+def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
+def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VST3q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
 
 // ...alternate versions to be allocated odd register numbers:
-def VST3q8oddPseudo_UPD  : VSTQQQQWBPseudo;
-def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo;
-def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo;
+def VST3q8oddPseudo   : VSTQQQQPseudo<IIC_VST3>;
+def VST3q16oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+def VST3q32oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+
+def VST3q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
 
 //   VST4     : Vector Store (multiple 4-element structures)
 class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
-          IIC_VST, "vst4", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr",
-          "", []>;
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+}
 
-def  VST4d8   : VST4D<0b0000, 0b0000, "8">;
-def  VST4d16  : VST4D<0b0000, 0b0100, "16">;
-def  VST4d32  : VST4D<0b0000, 0b1000, "32">;
+def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
+def  VST4d16  : VST4D<0b0000, {0,1,?,?}, "16">;
+def  VST4d32  : VST4D<0b0000, {1,0,?,?}, "32">;
 
-def  VST4d8Pseudo  : VSTQQPseudo;
-def  VST4d16Pseudo : VSTQQPseudo;
-def  VST4d32Pseudo : VSTQQPseudo;
+def  VST4d8Pseudo  : VSTQQPseudo<IIC_VST4>;
+def  VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
+def  VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
 
 // ...with address register writeback:
 class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
   : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST,
-           "vst4", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
-
-def VST4d8_UPD  : VST4DWB<0b0000, 0b0000, "8">;
-def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">;
-def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">;
-
-def VST4d8Pseudo_UPD  : VSTQQWBPseudo;
-def VST4d16Pseudo_UPD : VSTQQWBPseudo;
-def VST4d32Pseudo_UPD : VSTQQWBPseudo;
-
-// ...with double-spaced registers (non-updating versions for disassembly only):
-def VST4q8      : VST4D<0b0001, 0b0000, "8">;
-def VST4q16     : VST4D<0b0001, 0b0100, "16">;
-def VST4q32     : VST4D<0b0001, 0b1000, "32">;
-def VST4q8_UPD  : VST4DWB<0b0001, 0b0000, "8">;
-def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">;
-def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">;
-
-def VST4q8Pseudo_UPD  : VSTQQQQWBPseudo;
-def VST4q16Pseudo_UPD : VSTQQQQWBPseudo;
-def VST4q32Pseudo_UPD : VSTQQQQWBPseudo;
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
+           "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
+def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
+def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VST4d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+
+// ...with double-spaced registers:
+def VST4q8      : VST4D<0b0001, {0,0,?,?}, "8">;
+def VST4q16     : VST4D<0b0001, {0,1,?,?}, "16">;
+def VST4q32     : VST4D<0b0001, {1,0,?,?}, "32">;
+def VST4q8_UPD  : VST4DWB<0b0001, {0,0,?,?}, "8">;
+def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
+def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VST4q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
 
 // ...alternate versions to be allocated odd register numbers:
-def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo;
-def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo;
-def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo;
+def VST4q8oddPseudo   : VSTQQQQPseudo<IIC_VST4>;
+def VST4q16oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+def VST4q32oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+
+def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// Classes for VST*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
 
 //   VST1LN   : Vector Store (single element from one lane)
-//   FIXME: Not yet implemented.
+class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> {
+  let Rm = 0b1111;
+}
+class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNPseudo<IIC_VST1ln> {
+  let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                          addrmode6:$addr)];
+}
+
+def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
+                       NEONvgetlaneu> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
+                       NEONvgetlaneu> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{5};
+}
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo  : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+
+def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []>;
+
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{5};
+}
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST1lnu>;
+def VST1LNq16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
+def VST1LNq32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST1lnu>;
 
 //   VST2LN   : Vector Store (single 2-element structure from one lane)
 class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
-          IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr",
-          "", []>;
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+          IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+}
 
-def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8">;
-def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">;
-def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">;
+def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo  : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
 
 // ...with double-spaced registers:
-def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">;
-def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">;
+def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{4}   = Rn{4};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">;
-def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">;
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
 
 // ...with address register writeback:
 class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
           (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt,
+           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt,
           "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+          "$addr.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+}
 
-def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8">;
-def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">;
-def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">;
+def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+
+def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">;
-def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">;
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
 
 //   VST3LN   : Vector Store (single 3-element structure from one lane)
 class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
-           nohash_imm:$lane), IIC_VST, "vst3", Dt,
-          "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>;
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
+           nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+}
+
+def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8">;
-def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">;
-def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">;
+def VST3LNd8Pseudo  : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
 
 // ...with double-spaced registers:
-def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">;
-def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">;
+def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">;
-def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, "32">;
+def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
 
 // ...with address register writeback:
 class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
-          IIC_VST, "vst3", Dt,
-          "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VST3lnu, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []>;
+
+def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8">;
-def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">;
-def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">;
+def VST3LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+
+def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
 
-def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">;
-def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">;
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
 
 //   VST4LN   : Vector Store (single 4-element structure from one lane)
 class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs),
-          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
-           nohash_imm:$lane), IIC_VST, "vst4", Dt,
-          "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr",
-          "", []>;
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
+           nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+}
+
+def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
 
-def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8">;
-def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">;
-def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">;
+def VST4LNd8Pseudo  : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
 
 // ...with double-spaced registers:
-def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">;
-def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">;
+def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
 
-// ...alternate versions to be allocated odd register numbers:
-def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">;
-def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">;
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
 
 // ...with address register writeback:
 class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
-  : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
-          (ins addrmode6:$addr, am6offset:$offset,
-           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
-          IIC_VST, "vst4", Dt,
-  "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset",
-          "$addr.addr = $wb", []>;
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VST4lnu, "vst4", Dt,
+  "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+}
 
-def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8">;
-def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">;
-def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">;
+def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+
+def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
 
-def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">;
-def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">;
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
 
 } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
 
@@ -1000,98 +1687,92 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{
 // Instruction Classes
 //===----------------------------------------------------------------------===//
 
-// Basic 2-register operations: single-, double- and quad-register.
-class N2VS<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
-           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
-           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
-        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src),
-        IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>;
+// Basic 2-register operations: double- and quad-register.
 class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
            string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
-        (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "",
-        [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>;
 class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
            string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
-        (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "",
-        [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>;
 
 // Basic 2-register intrinsics, both double- and quad-register.
 class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
               bits<2> op17_16, bits<5> op11_7, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
-        (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
 class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
               bits<2> op17_16, bits<5> op11_7, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
-        (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
 // Narrow 2-register operations.
 class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
            InstrItinClass itin, string OpcodeStr, string Dt,
            ValueType TyD, ValueType TyQ, SDNode OpNode>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
-        (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (TyD (OpNode (TyQ QPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
 
 // Narrow 2-register intrinsics.
 class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
               bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType TyD, ValueType TyQ, Intrinsic IntOp>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
-        (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
 
 // Long 2-register operations (currently only used for VMOVL).
 class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
            InstrItinClass itin, string OpcodeStr, string Dt,
            ValueType TyQ, ValueType TyD, SDNode OpNode>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst),
-        (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
 
 // 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
 class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
-  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2),
-        (ins DPR:$src1, DPR:$src2), IIC_VPERMD, 
-        OpcodeStr, Dt, "$dst1, $dst2",
-        "$src1 = $dst1, $src2 = $dst2", []>;
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm),
+        (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+        OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
 class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
                   InstrItinClass itin, string OpcodeStr, string Dt>
-  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2),
-        (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$dst1, $dst2",
-        "$src1 = $dst1, $src2 = $dst2", []>;
-
-// Basic 3-register operations: single-, double- and quad-register.
-class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
-           string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
-           SDNode OpNode, bit Commutable>
-  : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm,
-        IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> {
-  let isCommutable = Commutable;
-}
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm),
+        (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
 
+// Basic 3-register operations: double- and quad-register.
 class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            InstrItinClass itin, string OpcodeStr, string Dt,
            ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
 // Same as N3VD but no data type.
@@ -1100,31 +1781,31 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            ValueType ResTy, ValueType OpTy,
            SDNode OpNode, bit Commutable>
   : N3VX<op24, op23, op21_20, op11_8, 0, op4,
-         (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, 
-         OpcodeStr, "$dst, $src1, $src2", "",
-         [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{
+         (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
   let isCommutable = Commutable;
 }
 
-class N3VDSL<bits<2> op21_20, bits<4> op11_8, 
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType Ty, SDNode ShOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (Ty DPR:$dst),
-              (Ty (ShOp (Ty DPR:$src1),
-                        (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> {
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
   let isCommutable = 0;
 }
-class N3VDSL16<bits<2> op21_20, bits<4> op11_8, 
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
                string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
-        NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","",
-        [(set (Ty DPR:$dst),
-              (Ty (ShOp (Ty DPR:$src1),
-                        (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> {
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
   let isCommutable = 0;
 }
 
@@ -1132,40 +1813,40 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            InstrItinClass itin, string OpcodeStr, string Dt,
            ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, 
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
 class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            InstrItinClass itin, string OpcodeStr,
            ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
   : N3VX<op24, op23, op21_20, op11_8, 1, op4,
-         (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, 
-         OpcodeStr, "$dst, $src1, $src2", "",
-         [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{
+         (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
   let isCommutable = Commutable;
 }
-class N3VQSL<bits<2> op21_20, bits<4> op11_8, 
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType ResTy, ValueType OpTy, SDNode ShOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (ResTy QPR:$dst),
-              (ResTy (ShOp (ResTy QPR:$src1),
-                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
                                                 imm:$lane)))))]> {
   let isCommutable = 0;
 }
 class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
                ValueType ResTy, ValueType OpTy, SDNode ShOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
-        NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","",
-        [(set (ResTy QPR:$dst),
-              (ResTy (ShOp (ResTy QPR:$src1),
-                           (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
                                                 imm:$lane)))))]> {
   let isCommutable = 0;
 }
@@ -1175,30 +1856,39 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               Format f, InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
-class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, 
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (Ty DPR:$dst),
-              (Ty (IntOp (Ty DPR:$src1),
-                         (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
                                            imm:$lane)))))]> {
   let isCommutable = 0;
 }
 class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (Ty DPR:$dst),
-              (Ty (IntOp (Ty DPR:$src1),
-                         (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> {
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
   let isCommutable = 0;
 }
 
@@ -1206,20 +1896,20 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               Format f, InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
-class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, 
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (ResTy QPR:$dst),
-              (ResTy (IntOp (ResTy QPR:$src1),
-                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
                                                  imm:$lane)))))]> {
   let isCommutable = 0;
 }
@@ -1227,93 +1917,95 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt,
                   ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (ResTy QPR:$dst),
-              (ResTy (IntOp (ResTy QPR:$src1),
-                            (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
                                                  imm:$lane)))))]> {
   let isCommutable = 0;
 }
+class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+  let isCommutable = 0;
+}
 
-// Multiply-Add/Sub operations: single-, double- and quad-register.
-class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
-                InstrItinClass itin, string OpcodeStr, string Dt,
-                ValueType Ty, SDNode MulOp, SDNode OpNode>
-  : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR_VFP2:$dst),
-        (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>;
-
+// Multiply-Add/Sub operations: double- and quad-register.
 class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
-                ValueType Ty, SDNode MulOp, SDNode OpNode>
+                ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set DPR:$dst, (Ty (OpNode DPR:$src1,
-                             (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>;
+
 class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt,
-                  ValueType Ty, SDNode MulOp, SDNode ShOp>
+                  ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst),
-        (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (Ty DPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
               (Ty (ShOp (Ty DPR:$src1),
-                        (Ty (MulOp DPR:$src2,
-                                   (Ty (NEONvduplane (Ty DPR_VFP2:$src3),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
                                                      imm:$lane)))))))]>;
 class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                     string OpcodeStr, string Dt,
                     ValueType Ty, SDNode MulOp, SDNode ShOp>
   : N3V<0, 1, op21_20, op11_8, 1, 0,
-        (outs DPR:$dst),
-        (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (Ty DPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
               (Ty (ShOp (Ty DPR:$src1),
-                        (Ty (MulOp DPR:$src2,
-                                   (Ty (NEONvduplane (Ty DPR_8:$src3),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_8:$Vm),
                                                      imm:$lane)))))))]>;
 
 class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
-                SDNode MulOp, SDNode OpNode>
+                SDPatternOperator MulOp, SDPatternOperator OpNode>
   : N3V<op24, op23, op21_20, op11_8, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst, (Ty (OpNode QPR:$src1,
-                             (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
 class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
-                  SDNode MulOp, SDNode ShOp>
+                  SDPatternOperator MulOp, SDPatternOperator ShOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst),
-        (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (ResTy QPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
               (ResTy (ShOp (ResTy QPR:$src1),
-                           (ResTy (MulOp QPR:$src2,
-                                   (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
                                                         imm:$lane)))))))]>;
 class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                     string OpcodeStr, string Dt,
                     ValueType ResTy, ValueType OpTy,
                     SDNode MulOp, SDNode ShOp>
   : N3V<1, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst),
-        (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane),
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (ResTy QPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
               (ResTy (ShOp (ResTy QPR:$src1),
-                           (ResTy (MulOp QPR:$src2,
-                                   (ResTy (NEONvduplane (OpTy DPR_8:$src3),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
                                                         imm:$lane)))))))]>;
 
 // Neon Intrinsic-Op instructions (VABA): double- and quad-register.
@@ -1321,18 +2013,18 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
                 ValueType Ty, Intrinsic IntOp, SDNode OpNode>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set DPR:$dst, (Ty (OpNode DPR:$src1,
-                             (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>;
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
 class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
                 ValueType Ty, Intrinsic IntOp, SDNode OpNode>
   : N3V<op24, op23, op21_20, op11_8, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst, (Ty (OpNode QPR:$src1,
-                             (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>;
 
 // Neon 3-argument intrinsics, both double- and quad-register.
 // The destination register is also used as the first source operand register.
@@ -1340,52 +2032,52 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
-                                      (OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1),
+                                      (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
 class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, op23, op21_20, op11_8, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
-                                      (OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1),
+                                      (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
 
 // Long Multiply-Add/Sub operations.
 class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                 InstrItinClass itin, string OpcodeStr, string Dt,
                 ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
-                                (TyQ (MulOp (TyD DPR:$src2),
-                                            (TyD DPR:$src3)))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (MulOp (TyD DPR:$Vn),
+                                            (TyD DPR:$Vm)))))]>;
 class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
                   InstrItinClass itin, string OpcodeStr, string Dt,
                   ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
-  : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
-        (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
+  : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set QPR:$dst,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set QPR:$Vd,
           (OpNode (TyQ QPR:$src1),
-                  (TyQ (MulOp (TyD DPR:$src2),
-                              (TyD (NEONvduplane (TyD DPR_VFP2:$src3),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
                                                  imm:$lane))))))]>;
 class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                     InstrItinClass itin, string OpcodeStr, string Dt,
                     ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
-  : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst),
-        (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
+  : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set QPR:$dst,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set QPR:$Vd,
           (OpNode (TyQ QPR:$src1),
-                  (TyQ (MulOp (TyD DPR:$src2),
-                              (TyD (NEONvduplane (TyD DPR_8:$src3),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_8:$Vm),
                                                  imm:$lane))))))]>;
 
 // Long Intrinsic-Op vector operations with explicit extend (VABAL).
@@ -1394,11 +2086,11 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                    ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
                    SDNode OpNode>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
-                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2),
-                                                        (TyD DPR:$src3)))))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                        (TyD DPR:$Vm)))))))]>;
 
 // Neon Long 3-argument intrinsic.  The destination register is
 // a quad-register and is also used as the first source operand register.
@@ -1406,35 +2098,35 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                InstrItinClass itin, string OpcodeStr, string Dt,
                ValueType TyQ, ValueType TyD, Intrinsic IntOp>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
-        [(set QPR:$dst,
-          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
 class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                  string OpcodeStr, string Dt,
                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst),
-        (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane),
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (ResTy QPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
               (ResTy (IntOp (ResTy QPR:$src1),
-                            (OpTy DPR:$src2),
-                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
                                                 imm:$lane)))))]>;
 class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                    InstrItinClass itin, string OpcodeStr, string Dt,
                    ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst),
-        (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane),
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
         NVMulSLFrm, itin,
-        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
-        [(set (ResTy QPR:$dst),
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
               (ResTy (IntOp (ResTy QPR:$src1),
-                            (OpTy DPR:$src2),
-                            (OpTy (NEONvduplane (OpTy DPR_8:$src3),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
                                                 imm:$lane)))))]>;
 
 // Narrowing 3-register intrinsics.
@@ -1442,9 +2134,9 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
               Intrinsic IntOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
+        (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
 
@@ -1453,29 +2145,29 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            InstrItinClass itin, string OpcodeStr, string Dt,
            ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> {
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
 class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType TyQ, ValueType TyD, SDNode OpNode>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set QPR:$dst,
-          (TyQ (OpNode (TyD DPR:$src1),
-                       (TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>;
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
 class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                InstrItinClass itin, string OpcodeStr, string Dt,
                ValueType TyQ, ValueType TyD, SDNode OpNode>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), 
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set QPR:$dst,
-          (TyQ (OpNode (TyD DPR:$src1),
-                       (TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>;
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
 
 // Long 3-register operations with explicitly extended operands.
 class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
@@ -1483,10 +2175,10 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
               bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (OpNode (TyQ (ExtOp (TyD DPR:$src1))),
-                                (TyQ (ExtOp (TyD DPR:$src2)))))]> {
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
   let isCommutable = Commutable;
 }
 
@@ -1496,10 +2188,10 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
                  ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp,
                  bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1),
-                                                (TyD DPR:$src2))))))]> {
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                (TyD DPR:$Vm))))))]> {
   let isCommutable = Commutable;
 }
 
@@ -1508,30 +2200,30 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
 class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (ResTy QPR:$dst),
-              (ResTy (IntOp (OpTy DPR:$src1),
-                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
                                                 imm:$lane)))))]>;
 class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
                   InstrItinClass itin, string OpcodeStr, string Dt,
                   ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N3V<op24, 1, op21_20, op11_8, 1, 0,
-        (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), 
-        NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
-        [(set (ResTy QPR:$dst),
-              (ResTy (IntOp (OpTy DPR:$src1),
-                            (OpTy (NEONvduplane (OpTy DPR_8:$src2),
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
                                                 imm:$lane)))))]>;
 
 // Wide 3-register operations.
@@ -1539,10 +2231,10 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
            string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
            SDNode OpNode, SDNode ExtOp, bit Commutable>
   : N3V<op24, op23, op21_20, op11_8, 0, op4,
-        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD,
-        OpcodeStr, Dt, "$dst, $src1, $src2", "",
-        [(set QPR:$dst, (OpNode (TyQ QPR:$src1),
-                                (TyQ (ExtOp (TyD DPR:$src2)))))]> {
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
   let isCommutable = Commutable;
 }
 
@@ -1551,16 +2243,16 @@ class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                 bits<2> op17_16, bits<5> op11_7, bit op4,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
-        (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
 class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                 bits<2> op17_16, bits<5> op11_7, bit op4,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
-  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
-        (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
 // Pairwise long 2-register accumulate intrinsics,
 // both double- and quad-register.
@@ -1570,17 +2262,17 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                  string OpcodeStr, string Dt,
                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
-        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD,
-        OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst",
-        [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>;
 class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
                  bits<2> op17_16, bits<5> op11_7, bit op4,
                  string OpcodeStr, string Dt,
                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
   : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
-        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ,
-        OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst",
-        [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>;
 
 // Shift by immediate,
 // both double- and quad-register.
@@ -1588,25 +2280,25 @@ class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType Ty, SDNode OpNode>
   : N2VImm<op24, op23, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin,
-           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
+           (outs DPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>;
 class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType Ty, SDNode OpNode>
   : N2VImm<op24, op23, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin,
-           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
+           (outs QPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
 
 // Long shift by immediate.
 class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
              string OpcodeStr, string Dt,
              ValueType ResTy, ValueType OpTy, SDNode OpNode>
   : N2VImm<op24, op23, op11_8, op7, op6, op4,
-           (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm,
-           IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
+           (outs QPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), N2RegVShLFrm,
+           IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm),
                                           (i32 imm:$SIMM))))]>;
 
 // Narrow shift by immediate.
@@ -1614,42 +2306,42 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType ResTy, ValueType OpTy, SDNode OpNode>
   : N2VImm<op24, op23, op11_8, op7, op6, op4,
-           (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin,
-           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
+           (outs DPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
                                           (i32 imm:$SIMM))))]>;
 
 // Shift right by immediate and accumulate,
 // both double- and quad-register.
 class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                 string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
-           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD,
-           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
-           [(set DPR:$dst, (Ty (add DPR:$src1,
-                                (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (add DPR:$src1,
+                                (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>;
 class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                 string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
-           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD,
-           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
-           [(set QPR:$dst, (Ty (add QPR:$src1,
-                                (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (add QPR:$src1,
+                                (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
 
 // Shift by immediate and insert,
 // both double- and quad-register.
 class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                 Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp>
-  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
-           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD,
-           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
-           [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>;
 class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
                 Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp>
-  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
-           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ,
-           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
-           [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiQ,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
 
 // Convert, with fractional bits immediate,
 // both double- and quad-register.
@@ -1657,16 +2349,16 @@ class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
               string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
   : N2VImm<op24, op23, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm,
-           IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
+           (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
 class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
               string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
   : N2VImm<op24, op23, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm,
-           IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "",
-           [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
+           (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclasses
@@ -1678,45 +2370,127 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
 //   S = single int (32 bit) elements
 //   D = double int (64 bit) elements
 
-// Neon 2-register vector operations -- for disassembly only.
+// Neon 2-register vector operations and intrinsics.
 
-// First with only element sizes of 8, 16 and 32 bits:
+// Neon 2-register comparisons.
+//   source operand element sizes of 8, 16 and 32 bits:
 multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
                        bits<5> op11_7, bit op4, string opc, string Dt,
-                       string asm> {
+                       string asm, SDNode OpNode> {
   // 64-bit vector types.
   def v8i8  : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
-                  (outs DPR:$dst), (ins DPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "8"), asm, "", []>;
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
   def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
-                  (outs DPR:$dst), (ins DPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "16"), asm, "", []>;
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
   def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
-                  (outs DPR:$dst), (ins DPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "32"), asm, "", []>;
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
   def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
-                  (outs DPR:$dst), (ins DPR:$src), NoItinerary,
-                  opc, "f32", asm, "", []> {
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
     let Inst{10} = 1; // overwrite F = 1
   }
 
   // 128-bit vector types.
   def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
-                  (outs QPR:$dst), (ins QPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "8"), asm, "", []>;
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
   def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
-                  (outs QPR:$dst), (ins QPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "16"), asm, "", []>;
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
   def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
-                  (outs QPR:$dst), (ins QPR:$src), NoItinerary,
-                  opc, !strconcat(Dt, "32"), asm, "", []>;
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
   def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
-                  (outs QPR:$dst), (ins QPR:$src), NoItinerary,
-                  opc, "f32", asm, "", []> {
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
     let Inst{10} = 1; // overwrite F = 1
   }
 }
 
+
+// Neon 2-register vector intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                      bits<5> op11_7, bit op4,
+                      InstrItinClass itinD, InstrItinClass itinQ,
+                      string OpcodeStr, string Dt, Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
+  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
+  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
+  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
+}
+
+
+// Neon Narrowing 2-register vector operations,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                    bits<5> op11_7, bit op6, bit op4,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    SDNode OpNode> {
+  def v8i8  : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "16"),
+                   v8i8, v8i16, OpNode>;
+  def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "32"),
+                   v4i16, v4i32, OpNode>;
+  def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "64"),
+                   v2i32, v2i64, OpNode>;
+}
+
+// Neon Narrowing 2-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op6, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       Intrinsic IntOp> {
+  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp>;
+  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp>;
+  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                    string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+
 // Neon 3-register vector operations.
 
 // First with only element sizes of 8, 16 and 32 bits:
@@ -1726,7 +2500,7 @@ multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                    string OpcodeStr, string Dt,
                    SDNode OpNode, bit Commutable = 0> {
   // 64-bit vector types.
-  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, 
+  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
                    OpcodeStr, !strconcat(Dt, "8"),
                    v8i8, v8i8, OpNode, Commutable>;
   def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
@@ -1775,54 +2549,6 @@ multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
 }
 
 
-// Neon Narrowing 2-register vector operations,
-//   source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
-                    bits<5> op11_7, bit op6, bit op4, 
-                    InstrItinClass itin, string OpcodeStr, string Dt,
-                    SDNode OpNode> {
-  def v8i8  : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
-                   itin, OpcodeStr, !strconcat(Dt, "16"),
-                   v8i8, v8i16, OpNode>;
-  def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
-                   itin, OpcodeStr, !strconcat(Dt, "32"),
-                   v4i16, v4i32, OpNode>;
-  def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
-                   itin, OpcodeStr, !strconcat(Dt, "64"),
-                   v2i32, v2i64, OpNode>;
-}
-
-// Neon Narrowing 2-register vector intrinsics,
-//   source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
-                       bits<5> op11_7, bit op6, bit op4, 
-                       InstrItinClass itin, string OpcodeStr, string Dt,
-                       Intrinsic IntOp> {
-  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
-                      itin, OpcodeStr, !strconcat(Dt, "16"),
-                      v8i8, v8i16, IntOp>;
-  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
-                      itin, OpcodeStr, !strconcat(Dt, "32"),
-                      v4i16, v4i32, IntOp>;
-  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
-                      itin, OpcodeStr, !strconcat(Dt, "64"),
-                      v2i32, v2i64, IntOp>;
-}
-
-
-// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
-//   source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
-                    string OpcodeStr, string Dt, SDNode OpNode> {
-  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
-  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
-  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
-}
-
-
 // Neon 3-register vector intrinsics.
 
 // First with only element sizes of 16 and 32 bits:
@@ -1847,8 +2573,29 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                       OpcodeStr, !strconcat(Dt, "32"),
                       v4i32, v4i32, IntOp, Commutable>;
 }
+multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp>;
+}
 
-multiclass N3VIntSL_HS<bits<4> op11_8, 
+multiclass N3VIntSL_HS<bits<4> op11_8,
                        InstrItinClass itinD16, InstrItinClass itinD32,
                        InstrItinClass itinQ16, InstrItinClass itinQ32,
                        string OpcodeStr, string Dt, Intrinsic IntOp> {
@@ -1877,6 +2624,21 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                       OpcodeStr, !strconcat(Dt, "8"),
                       v16i8, v16i8, IntOp, Commutable>;
 }
+multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      Intrinsic IntOp>
+  : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp> {
+  def v8i8  : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp>;
+  def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp>;
+}
+
 
 // ....then also with element size of 64 bits:
 multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
@@ -1893,6 +2655,20 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
                       OpcodeStr, !strconcat(Dt, "64"),
                       v2i64, v2i64, IntOp, Commutable>;
 }
+multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       Intrinsic IntOp>
+  : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp> {
+  def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp>;
+  def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp>;
+}
 
 // Neon Narrowing 3-register vector intrinsics,
 //   source operand element sizes of 16, 32 and 64 bits:
@@ -1920,7 +2696,7 @@ multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
   def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
                    OpcodeStr, !strconcat(Dt, "8"),
                    v8i16, v8i8, OpNode, Commutable>;
-  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, 
+  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
                    OpcodeStr, !strconcat(Dt, "16"),
                    v4i32, v4i16, OpNode, Commutable>;
   def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
@@ -1944,7 +2720,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
   def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
                       OpcodeStr, !strconcat(Dt, "8"),
                       v8i16, v8i8, OpNode, ExtOp, Commutable>;
-  def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, 
+  def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
                       OpcodeStr, !strconcat(Dt, "16"),
                       v4i32, v4i16, OpNode, ExtOp, Commutable>;
   def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
@@ -1959,7 +2735,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
                       InstrItinClass itin16, InstrItinClass itin32,
                       string OpcodeStr, string Dt,
                       Intrinsic IntOp, bit Commutable = 0> {
-  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, 
+  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
                       OpcodeStr, !strconcat(Dt, "16"),
                       v4i32, v4i16, IntOp, Commutable>;
   def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32,
@@ -1970,7 +2746,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
 multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
                         InstrItinClass itin, string OpcodeStr, string Dt,
                         Intrinsic IntOp> {
-  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, 
+  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
                           OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
   def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
@@ -1995,7 +2771,7 @@ multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
   def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
                          OpcodeStr, !strconcat(Dt, "8"),
                          v8i16, v8i8, IntOp, ExtOp, Commutable>;
-  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, 
+  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
                          OpcodeStr, !strconcat(Dt, "16"),
                          v4i32, v4i16, IntOp, ExtOp, Commutable>;
   def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
@@ -2044,7 +2820,7 @@ multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
 }
 
-multiclass N3VMulOpSL_HS<bits<4> op11_8, 
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
                          InstrItinClass itinD16, InstrItinClass itinD32,
                          InstrItinClass itinQ16, InstrItinClass itinQ32,
                          string OpcodeStr, string Dt, SDNode ShOp> {
@@ -2174,30 +2950,6 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
 }
 
 
-// Neon 2-register vector intrinsics,
-//   element sizes of 8, 16 and 32 bits:
-multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
-                      bits<5> op11_7, bit op4,
-                      InstrItinClass itinD, InstrItinClass itinQ,
-                      string OpcodeStr, string Dt, Intrinsic IntOp> {
-  // 64-bit vector types.
-  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
-                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
-  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
-                      itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
-  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
-                      itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
-
-  // 128-bit vector types.
-  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
-                      itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
-  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
-                      itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
-  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
-                      itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
-}
-
-
 // Neon Pairwise long 2-register intrinsics,
 //   element sizes of 8, 16 and 32 bits:
 multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
@@ -2461,9 +3213,9 @@ def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
                         "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
 def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
                         "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
-def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32",
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
                      v2f32, v2f32, fmul, 1>;
-def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32",
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
                      v4f32, v4f32, fmul, 1>;
 defm VMULsl   : N3VSL_HS<0b1000, "vmul", "i", mul>;
 def  VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
@@ -2491,7 +3243,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
 
 //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
 defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
-                          IIC_VMULi16Q, IIC_VMULi32Q, 
+                          IIC_VMULi16Q, IIC_VMULi32Q,
                           "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
 defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
                             IIC_VMULi16Q, IIC_VMULi32Q,
@@ -2555,15 +3307,19 @@ defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
 defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
-                          v2f32, fmul, fadd>;
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
-                          v4f32, fmul, fadd>;
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
-                            v2f32, fmul, fadd>;
+                            v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
-                            v4f32, v2f32, fmul, fadd>;
+                            v4f32, v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 
 def : Pat<(v8i16 (add (v8i16 QPR:$src1),
                   (mul (v8i16 QPR:$src2),
@@ -2581,14 +3337,15 @@ def : Pat<(v4i32 (add (v4i32 QPR:$src1),
                                       (DSubReg_i32_reg imm:$lane))),
                               (SubReg_i32_lane imm:$lane)))>;
 
-def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
-                  (fmul (v4f32 QPR:$src2),
+def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
                         (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
           (v4f32 (VMLAslfq (v4f32 QPR:$src1),
                            (v4f32 QPR:$src2),
                            (v2f32 (EXTRACT_SUBREG QPR:$src3,
                                    (DSubReg_i32_reg imm:$lane))),
-                           (SubReg_i32_lane imm:$lane)))>;
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
 
 //   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
 defm VMLALs   : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
@@ -2608,15 +3365,19 @@ defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
 defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
-                          v2f32, fmul, fsub>;
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
-                          v4f32, fmul, fsub>;
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
-                            v2f32, fmul, fsub>;
+                            v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
-                            v4f32, v2f32, fmul, fsub>;
+                            v4f32, v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
 
 def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
                   (mul (v8i16 QPR:$src2),
@@ -2634,13 +3395,14 @@ def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
                                       (DSubReg_i32_reg imm:$lane))),
                               (SubReg_i32_lane imm:$lane)))>;
 
-def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
-                  (fmul (v4f32 QPR:$src2),
+def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
                         (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
           (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
                            (v2f32 (EXTRACT_SUBREG QPR:$src3,
                                    (DSubReg_i32_reg imm:$lane))),
-                           (SubReg_i32_lane imm:$lane)))>;
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
 
 //   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
 defm VMLSLs   : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
@@ -2703,25 +3465,24 @@ def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
                      NEONvceq, 1>;
 def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
                      NEONvceq, 1>;
-// For disassembly only.
+
 defm VCEQz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
-                            "$dst, $src, #0">;
+                            "$Vd, $Vm, #0", NEONvceqz>;
 
 //   VCGE     : Vector Compare Greater Than or Equal
 defm VCGEs    : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
                         IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
-defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, 
+defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
                         IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
 def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
                      NEONvcge, 0>;
 def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
                      NEONvcge, 0>;
-// For disassembly only.
+
 defm VCGEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
-                            "$dst, $src, #0">;
-// For disassembly only.
+                            "$Vd, $Vm, #0", NEONvcgez>;
 defm VCLEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
-                            "$dst, $src, #0">;
+                            "$Vd, $Vm, #0", NEONvclez>;
 
 //   VCGT     : Vector Compare Greater Than
 defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
@@ -2732,12 +3493,11 @@ def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
                      NEONvcgt, 0>;
 def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
                      NEONvcgt, 0>;
-// For disassembly only.
+
 defm VCGTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
-                            "$dst, $src, #0">;
-// For disassembly only.
+                            "$Vd, $Vm, #0", NEONvcgtz>;
 defm VCLTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
-                            "$dst, $src, #0">;
+                            "$Vd, $Vm, #0", NEONvcltz>;
 
 //   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
 def  VACGEd   : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
@@ -2750,7 +3510,7 @@ def  VACGTd   : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
 def  VACGTq   : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
                         "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>;
 //   VTST     : Vector Test Bits
-defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, 
+defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
                         IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
 
 // Vector Bitwise Operations.
@@ -2779,104 +3539,190 @@ def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
 def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
                       v4i32, v4i32, or, 1>;
 
+def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+
 //   VBIC     : Vector Bitwise Bit Clear (AND NOT)
-def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
-                     (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD,
-                     "vbic", "$dst, $src1, $src2", "",
-                     [(set DPR:$dst, (v2i32 (and DPR:$src1,
-                                                 (vnotd DPR:$src2))))]>;
-def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
-                     (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ,
-                     "vbic", "$dst, $src1, $src2", "",
-                     [(set QPR:$dst, (v4i32 (and QPR:$src1,
-                                                 (vnotq QPR:$src2))))]>;
+def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (and DPR:$Vn,
+                                                 (vnotd DPR:$Vm))))]>;
+def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
+                                                 (vnotq QPR:$Vm))))]>;
+
+def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
 
 //   VORN     : Vector Bitwise OR NOT
-def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
-                     (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD,
-                     "vorn", "$dst, $src1, $src2", "",
-                     [(set DPR:$dst, (v2i32 (or DPR:$src1,
-                                                (vnotd DPR:$src2))))]>;
-def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
-                     (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ,
-                     "vorn", "$dst, $src1, $src2", "",
-                     [(set QPR:$dst, (v4i32 (or QPR:$src1,
-                                                (vnotq QPR:$src2))))]>;
+def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (or DPR:$Vn,
+                                                (vnotd DPR:$Vm))))]>;
+def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
+                                                (vnotq QPR:$Vm))))]>;
 
 //   VMVN     : Vector Bitwise NOT (Immediate)
 
 let isReMaterializable = 1 in {
-def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$dst),
+
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmvn", "i16", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v4i16 (NEONvmvnImm timm:$SIMM)))]>;
-def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$dst),
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmvn", "i16", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v8i16 (NEONvmvnImm timm:$SIMM)))]>;
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
 
-def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$dst),
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmvn", "i32", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v2i32 (NEONvmvnImm timm:$SIMM)))]>;
-def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$dst),
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmvn", "i32", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v4i32 (NEONvmvnImm timm:$SIMM)))]>;
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
 }
 
 //   VMVN     : Vector Bitwise NOT
 def  VMVNd    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
-                     (outs DPR:$dst), (ins DPR:$src), IIC_VSUBiD,
-                     "vmvn", "$dst, $src", "",
-                     [(set DPR:$dst, (v2i32 (vnotd DPR:$src)))]>;
+                     (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>;
 def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
-                     (outs QPR:$dst), (ins QPR:$src), IIC_VSUBiD,
-                     "vmvn", "$dst, $src", "",
-                     [(set QPR:$dst, (v4i32 (vnotq QPR:$src)))]>;
+                     (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
 def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
 def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
 
 //   VBSL     : Vector Bitwise Select
-def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
-                     (ins DPR:$src1, DPR:$src2, DPR:$src3),
+def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
                      N3RegFrm, IIC_VCNTiD,
-                     "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
-                     [(set DPR:$dst,
-                       (v2i32 (or (and DPR:$src2, DPR:$src1),
-                                  (and DPR:$src3, (vnotd DPR:$src1)))))]>;
-def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
-                     (ins QPR:$src1, QPR:$src2, QPR:$src3),
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set DPR:$Vd,
+                       (v2i32 (or (and DPR:$Vn, DPR:$src1),
+                                  (and DPR:$Vm, (vnotd DPR:$src1)))))]>;
+def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
                      N3RegFrm, IIC_VCNTiQ,
-                     "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
-                     [(set QPR:$dst,
-                       (v4i32 (or (and QPR:$src2, QPR:$src1),
-                                  (and QPR:$src3, (vnotq QPR:$src1)))))]>;
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set QPR:$Vd,
+                       (v4i32 (or (and QPR:$Vn, QPR:$src1),
+                                  (and QPR:$Vm, (vnotq QPR:$src1)))))]>;
 
 //   VBIF     : Vector Bitwise Insert if False
 //              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
 def  VBIFd    : N3VX<1, 0, 0b11, 0b0001, 0, 1,
-                     (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
                      N3RegFrm, IIC_VBINiD,
-                     "vbif", "$dst, $src2, $src3", "$src1 = $dst",
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
                      [/* For disassembly only; pattern left blank */]>;
 def  VBIFq    : N3VX<1, 0, 0b11, 0b0001, 1, 1,
-                     (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
                      N3RegFrm, IIC_VBINiQ,
-                     "vbif", "$dst, $src2, $src3", "$src1 = $dst",
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
                      [/* For disassembly only; pattern left blank */]>;
 
 //   VBIT     : Vector Bitwise Insert if True
 //              like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
 def  VBITd    : N3VX<1, 0, 0b10, 0b0001, 0, 1,
-                     (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
                      N3RegFrm, IIC_VBINiD,
-                     "vbit", "$dst, $src2, $src3", "$src1 = $dst",
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
                      [/* For disassembly only; pattern left blank */]>;
 def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
-                     (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
                      N3RegFrm, IIC_VBINiQ,
-                     "vbit", "$dst, $src2, $src3", "$src1 = $dst",
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
                      [/* For disassembly only; pattern left blank */]>;
 
 // VBIT/VBIF are not yet implemented.  The TwoAddress pass will not go looking
@@ -2957,8 +3803,8 @@ def  VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
 def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
                         "vpadd", "i32",
                         v2i32, v2i32, int_arm_neon_vpadd, 0>;
-def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, 
-                        IIC_VBIND, "vpadd", "f32",
+def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
+                        IIC_VPBIND, "vpadd", "f32",
                         v2f32, v2f32, int_arm_neon_vpadd, 0>;
 
 //   VPADDL   : Vector Pairwise Add Long
@@ -2986,7 +3832,7 @@ def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
                         "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
 def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
                         "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
-def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
                         "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
 
 //   VPMIN    : Vector Pairwise Minimum
@@ -3002,16 +3848,16 @@ def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
                         "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
 def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
                         "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
-def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VSUBi4D, "vpmin",
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
                         "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
 
 // Vector Reciprocal and Reciprocal Square Root Estimate and Step.
 
 //   VRECPE   : Vector Reciprocal Estimate
-def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, 
+def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
                         IIC_VUNAD, "vrecpe", "u32",
                         v2i32, v2i32, int_arm_neon_vrecpe>;
-def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, 
+def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
                         IIC_VUNAQ, "vrecpe", "u32",
                         v4i32, v4i32, int_arm_neon_vrecpe>;
 def  VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
@@ -3039,7 +3885,7 @@ def  VRSQRTEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
 def  VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
                          IIC_VUNAD, "vrsqrte", "f32",
                          v2f32, v2f32, int_arm_neon_vrsqrte>;
-def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, 
+def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
                          IIC_VUNAQ, "vrsqrte", "f32",
                          v4f32, v4f32, int_arm_neon_vrsqrte>;
 
@@ -3054,12 +3900,12 @@ def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
 // Vector Shifts.
 
 //   VSHL     : Vector Shift
-defm VSHLs    : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm,
+defm VSHLs    : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm,
                             IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
-                            "vshl", "s", int_arm_neon_vshifts, 0>;
-defm VSHLu    : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm,
+                            "vshl", "s", int_arm_neon_vshifts>;
+defm VSHLu    : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
                             IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
-                            "vshl", "u", int_arm_neon_vshiftu, 0>;
+                            "vshl", "u", int_arm_neon_vshiftu>;
 //   VSHL     : Vector Shift Left (Immediate)
 defm VSHLi    : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl,
                            N2RegVShLFrm>;
@@ -3093,12 +3939,12 @@ defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
                            NEONvshrn>;
 
 //   VRSHL    : Vector Rounding Shift
-defm VRSHLs   : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm,
+defm VRSHLs   : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vrshl", "s", int_arm_neon_vrshifts, 0>;
-defm VRSHLu   : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm,
+                            "vrshl", "s", int_arm_neon_vrshifts>;
+defm VRSHLu   : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vrshl", "u", int_arm_neon_vrshiftu, 0>;
+                            "vrshl", "u", int_arm_neon_vrshiftu>;
 //   VRSHR    : Vector Rounding Shift Right
 defm VRSHRs   : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs,
                            N2RegVShRFrm>;
@@ -3110,12 +3956,12 @@ defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
                            NEONvrshrn>;
 
 //   VQSHL    : Vector Saturating Shift
-defm VQSHLs   : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm,
+defm VQSHLs   : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vqshl", "s", int_arm_neon_vqshifts, 0>;
-defm VQSHLu   : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm,
+                            "vqshl", "s", int_arm_neon_vqshifts>;
+defm VQSHLu   : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vqshl", "u", int_arm_neon_vqshiftu, 0>;
+                            "vqshl", "u", int_arm_neon_vqshiftu>;
 //   VQSHL    : Vector Saturating Shift Left (Immediate)
 defm VQSHLsi  : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls,
                            N2RegVShLFrm>;
@@ -3136,12 +3982,12 @@ defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
                            NEONvqshrnsu>;
 
 //   VQRSHL   : Vector Saturating Rounding Shift
-defm VQRSHLs  : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm,
+defm VQRSHLs  : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vqrshl", "s", int_arm_neon_vqrshifts, 0>;
-defm VQRSHLu  : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm,
+                            "vqrshl", "s", int_arm_neon_vqrshifts>;
+defm VQRSHLu  : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
                             IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
-                            "vqrshl", "u", int_arm_neon_vqrshiftu, 0>;
+                            "vqrshl", "u", int_arm_neon_vqrshiftu>;
 
 //   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
 defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
@@ -3168,7 +4014,7 @@ defm VSRI     : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>;
 // Vector Absolute and Saturating Absolute.
 
 //   VABS     : Vector Absolute Value
-defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, 
+defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
                            IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
                            int_arm_neon_vabs>;
 def  VABSfd   : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
@@ -3179,7 +4025,7 @@ def  VABSfq   : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
                         v4f32, v4f32, int_arm_neon_vabs>;
 
 //   VQABS    : Vector Saturating Absolute Value
-defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, 
+defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
                            IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
                            int_arm_neon_vqabs>;
 
@@ -3191,13 +4037,13 @@ def vnegq  : PatFrag<(ops node:$in),
                      (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
 
 class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
-        IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (Ty (vnegd DPR:$src)))]>;
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+        IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>;
 class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
-        IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (Ty (vnegq QPR:$src)))]>;
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm),
+        IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>;
 
 //   VNEG     : Vector Negate (integer)
 def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
@@ -3209,13 +4055,13 @@ def  VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
 
 //   VNEG     : Vector Negate (floating-point)
 def  VNEGfd   : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
-                    (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD,
-                    "vneg", "f32", "$dst, $src", "",
-                    [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
+                    (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>;
 def  VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
-                    (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ,
-                    "vneg", "f32", "$dst, $src", "",
-                    [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
+                    (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
 
 def : Pat<(v8i8  (vnegd  DPR:$src)), (VNEGs8d DPR:$src)>;
 def : Pat<(v4i16 (vnegd  DPR:$src)), (VNEGs16d DPR:$src)>;
@@ -3225,22 +4071,22 @@ def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
 def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
 
 //   VQNEG    : Vector Saturating Negate
-defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, 
+defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
                            IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
                            int_arm_neon_vqneg>;
 
 // Vector Bit Counting Operations.
 
 //   VCLS     : Vector Count Leading Sign Bits
-defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, 
+defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
                            IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
                            int_arm_neon_vcls>;
 //   VCLZ     : Vector Count Leading Zeros
-defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, 
+defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
                            IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
                            int_arm_neon_vclz>;
 //   VCNT     : Vector Count One Bits
-def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, 
+def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
                         IIC_VCNTiD, "vcnt", "8",
                         v8i8, v8i8, int_arm_neon_vcnt>;
 def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
@@ -3249,98 +4095,126 @@ def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
 
 // Vector Swap -- for disassembly only.
 def  VSWPd    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
-                     (outs DPR:$dst), (ins DPR:$src), NoItinerary,
-                     "vswp", "$dst, $src", "", []>;
+                     (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                     "vswp", "$Vd, $Vm", "", []>;
 def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
-                     (outs QPR:$dst), (ins QPR:$src), NoItinerary,
-                     "vswp", "$dst, $src", "", []>;
+                     (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                     "vswp", "$Vd, $Vm", "", []>;
 
 // Vector Move Operations.
 
 //   VMOV     : Vector Move (Register)
 
 let neverHasSideEffects = 1 in {
-def  VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
-                     N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>;
-def  VMOVQ    : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
-                     N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>;
+def  VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$Vm),
+                     N3RegFrm, IIC_VMOV, "vmov", "$Vd, $Vm", "", []> {
+  let Vn{4-0} = Vm{4-0};
+}
+def  VMOVQ    : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$Vm),
+                     N3RegFrm, IIC_VMOV, "vmov", "$Vd, $Vm", "", []> {
+  let Vn{4-0} = Vm{4-0};
+}
 
 // Pseudo vector move instructions for QQ and QQQQ registers. This should
 // be expanded after register allocation is completed.
 def  VMOVQQ   : PseudoInst<(outs QQPR:$dst), (ins QQPR:$src),
-                NoItinerary, "${:comment} vmov\t$dst, $src", []>;
+                NoItinerary, []>;
 
 def  VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src),
-                NoItinerary, "${:comment} vmov\t$dst, $src", []>;
+                NoItinerary, []>;
 } // neverHasSideEffects
 
 //   VMOV     : Vector Move (Immediate)
 
 let isReMaterializable = 1 in {
-def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
+def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i8", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
-def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i8", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
 
-def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst),
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i16", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v4i16 (NEONvmovImm timm:$SIMM)))]>;
-def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst),
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i16", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v8i16 (NEONvmovImm timm:$SIMM)))]>;
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
 
-def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$dst),
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i32", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v2i32 (NEONvmovImm timm:$SIMM)))]>;
-def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$dst),
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i32", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v4i32 (NEONvmovImm timm:$SIMM)))]>;
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
 
-def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i64", "$dst, $SIMM", "",
-                         [(set DPR:$dst, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
-def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
                          (ins nModImm:$SIMM), IIC_VMOVImm,
-                         "vmov", "i64", "$dst, $SIMM", "",
-                         [(set QPR:$dst, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
 } // isReMaterializable
 
 //   VMOV     : Vector Get Lane (move scalar to ARM core register)
 
 def VGETLNs8  : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
-                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-                          IIC_VMOVSI, "vmov", "s8", "$dst, $src[$lane]",
-                          [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src),
-                                           imm:$lane))]>;
+                          (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "s8", "$R, $V[$lane]",
+                          [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
 def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
-                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-                          IIC_VMOVSI, "vmov", "s16", "$dst, $src[$lane]",
-                          [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src),
-                                           imm:$lane))]>;
+                          (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$R, $V[$lane]",
+                          [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
 def VGETLNu8  : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
-                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-                          IIC_VMOVSI, "vmov", "u8", "$dst, $src[$lane]",
-                          [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src),
-                                           imm:$lane))]>;
+                          (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "u8", "$R, $V[$lane]",
+                          [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
 def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
-                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-                          IIC_VMOVSI, "vmov", "u16", "$dst, $src[$lane]",
-                          [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src),
-                                           imm:$lane))]>;
+                          (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$R, $V[$lane]",
+                          [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
 def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
-                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-                          IIC_VMOVSI, "vmov", "32", "$dst, $src[$lane]",
-                          [(set GPR:$dst, (extractelt (v2i32 DPR:$src),
-                                           imm:$lane))]>;
+                          (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$R, $V[$lane]",
+                          [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{0};
+}
 // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
 def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
           (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
@@ -3376,37 +4250,45 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
 
 //   VMOV     : Vector Set Lane (move ARM core register to scalar)
 
-let Constraints = "$src1 = $dst" in {
-def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$dst),
-                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
-                          IIC_VMOVISL, "vmov", "8", "$dst[$lane], $src2",
-                          [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
-                                           GPR:$src2, imm:$lane))]>;
-def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$dst),
-                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
-                          IIC_VMOVISL, "vmov", "16", "$dst[$lane], $src2",
-                          [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
-                                           GPR:$src2, imm:$lane))]>;
-def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$dst),
-                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
-                          IIC_VMOVISL, "vmov", "32", "$dst[$lane], $src2",
-                          [(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
-                                           GPR:$src2, imm:$lane))]>;
+let Constraints = "$src1 = $V" in {
+def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "8", "$V[$lane], $R",
+                          [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$V[$lane], $R",
+                          [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$V[$lane], $R",
+                          [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{0};
+}
 }
 def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
-          (v16i8 (INSERT_SUBREG QPR:$src1, 
+          (v16i8 (INSERT_SUBREG QPR:$src1,
                   (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
                                    (DSubReg_i8_reg imm:$lane))),
                             GPR:$src2, (SubReg_i8_lane imm:$lane))),
                   (DSubReg_i8_reg imm:$lane)))>;
 def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
-          (v8i16 (INSERT_SUBREG QPR:$src1, 
+          (v8i16 (INSERT_SUBREG QPR:$src1,
                   (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
                                      (DSubReg_i16_reg imm:$lane))),
                              GPR:$src2, (SubReg_i16_lane imm:$lane))),
                   (DSubReg_i16_reg imm:$lane)))>;
 def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
-          (v4i32 (INSERT_SUBREG QPR:$src1, 
+          (v4i32 (INSERT_SUBREG QPR:$src1,
                   (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
                                      (DSubReg_i32_reg imm:$lane))),
                              GPR:$src2, (SubReg_i32_lane imm:$lane))),
@@ -3454,13 +4336,13 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
 //   VDUP     : Vector Duplicate (from ARM core register to all elements)
 
 class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
-  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
-          IIC_VMOVIS, "vdup", Dt, "$dst, $src",
-          [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
+  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
 class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
-  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
-          IIC_VMOVIS, "vdup", Dt, "$dst, $src",
-          [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
+  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
 
 def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
 def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
@@ -3469,40 +4351,56 @@ def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
 def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
 def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
 
-def  VDUPfd   : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
-                      IIC_VMOVIS, "vdup", "32", "$dst, $src",
-                      [(set DPR:$dst, (v2f32 (NEONvdup
-                                              (f32 (bitconvert GPR:$src)))))]>;
-def  VDUPfq   : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
-                      IIC_VMOVIS, "vdup", "32", "$dst, $src",
-                      [(set QPR:$dst, (v4f32 (NEONvdup
-                                              (f32 (bitconvert GPR:$src)))))]>;
+def  VDUPfd   : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$V), (ins GPR:$R),
+                      IIC_VMOVIS, "vdup", "32", "$V, $R",
+                      [(set DPR:$V, (v2f32 (NEONvdup
+                                              (f32 (bitconvert GPR:$R)))))]>;
+def  VDUPfq   : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$V), (ins GPR:$R),
+                      IIC_VMOVIS, "vdup", "32", "$V, $R",
+                      [(set QPR:$V, (v4f32 (NEONvdup
+                                              (f32 (bitconvert GPR:$R)))))]>;
 
 //   VDUP     : Vector Duplicate Lane (from scalar to all elements)
 
 class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
               ValueType Ty>
-  : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-              IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]",
-              [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
+  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
+              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+              [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
 
 class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy>
-  : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane),
-              IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]",
-              [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src),
+  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
+              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+              [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
                                       imm:$lane)))]>;
 
 // Inst{19-16} is partially specified depending on the element size.
 
-def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>;
-def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>;
-def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>;
-def VDUPLNfd  : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>;
-def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>;
-def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>;
-def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>;
-def VDUPLNfq  : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>;
+def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8> {
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> {
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> {
+  let Inst{19} = lane{0};
+}
+def VDUPLNfd  : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32> {
+  let Inst{19} = lane{0};
+}
+def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> {
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> {
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> {
+  let Inst{19} = lane{0};
+}
+def VDUPLNfq  : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32> {
+  let Inst{19} = lane{0};
+}
 
 def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
           (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
@@ -3521,18 +4419,13 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
-def  VDUPfdf  : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 0, 0,
-                    (outs DPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "",
+def  VDUPfdf : PseudoNeonI<(outs DPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
                     [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
-
-def  VDUPfqf  : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0,
-                    (outs QPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "",
+def  VDUPfqf : PseudoNeonI<(outs QPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
                     [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
 
 //   VMOVN    : Vector Narrowing Move
-defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD,
+defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
                          "vmovn", "i", trunc>;
 //   VQMOVN   : Vector Saturating Narrowing Move
 defm VQMOVNs  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
@@ -3585,20 +4478,30 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
 def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 
+//   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
+def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f16.f32",
+                        v4i16, v4f32, int_arm_neon_vcvtfp2hf>,
+                Requires<[HasNEON, HasFP16]>;
+def  VCVTh2f  : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f32.f16",
+                        v4f32, v4i16, int_arm_neon_vcvthf2fp>,
+                Requires<[HasNEON, HasFP16]>;
+
 // Vector Reverse.
 
 //   VREV64   : Vector Reverse elements within 64-bit doublewords
 
 class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
-        (ins DPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
 class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
-        (ins QPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
 
 def VREV64d8  : VREV64D<0b00, "vrev64", "8", v8i8>;
 def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
@@ -3613,15 +4516,15 @@ def VREV64qf  : VREV64Q<0b10, "vrev64", "32", v4f32>;
 //   VREV32   : Vector Reverse elements within 32-bit words
 
 class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
-        (ins DPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
 class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
-        (ins QPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
 
 def VREV32d8  : VREV32D<0b00, "vrev32", "8", v8i8>;
 def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
@@ -3632,46 +4535,91 @@ def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
 //   VREV16   : Vector Reverse elements within 16-bit halfwords
 
 class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
-        (ins DPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
 class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
-        (ins QPR:$src), IIC_VMOVD, 
-        OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>;
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
 
 def VREV16d8  : VREV16D<0b00, "vrev16", "8", v8i8>;
 def VREV16q8  : VREV16Q<0b00, "vrev16", "8", v16i8>;
 
 // Other Vector Shuffles.
 
+//  Aligned extractions: really just dropping registers
+
+class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
+      : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
+             (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+
+def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
+
+def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>;
+
+def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>;
+
+def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
+
+def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+
+
 //   VEXT     : Vector Extract
 
 class VEXTd<string OpcodeStr, string Dt, ValueType Ty>
-  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst),
-        (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm,
-        IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "",
-        [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
-                                      (Ty DPR:$rhs), imm:$index)))]>;
+  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$Vm, i32imm:$index), NVExtFrm,
+        IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
+                                      (Ty DPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
 
 class VEXTq<string OpcodeStr, string Dt, ValueType Ty>
-  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst),
-        (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm,
-        IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "",
-        [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
-                                      (Ty QPR:$rhs), imm:$index)))]>;
-
-def VEXTd8  : VEXTd<"vext", "8",  v8i8>;
-def VEXTd16 : VEXTd<"vext", "16", v4i16>;
-def VEXTd32 : VEXTd<"vext", "32", v2i32>;
-def VEXTdf  : VEXTd<"vext", "32", v2f32>;
-
-def VEXTq8  : VEXTq<"vext", "8",  v16i8>;
-def VEXTq16 : VEXTq<"vext", "16", v8i16>;
-def VEXTq32 : VEXTq<"vext", "32", v4i32>;
-def VEXTqf  : VEXTq<"vext", "32", v4f32>;
+  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
+        (ins QPR:$Vn, QPR:$Vm, i32imm:$index), NVExtFrm,
+        IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
+                                      (Ty QPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
+
+def VEXTd8  : VEXTd<"vext", "8",  v8i8> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTd16 : VEXTd<"vext", "16", v4i16> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTd32 : VEXTd<"vext", "32", v2i32> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def VEXTdf  : VEXTd<"vext", "32", v2f32> {
+  let Inst{11}    = index{0};
+  let Inst{10-8}  = 0b000;
+}
+
+def VEXTq8  : VEXTq<"vext", "8",  v16i8> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTq16 : VEXTq<"vext", "16", v8i16> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTq32 : VEXTq<"vext", "32", v4i32> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def VEXTqf  : VEXTq<"vext", "32", v4f32> {
+  let Inst{11}    = index{0};
+  let Inst{10-8}  = 0b000;
+}
 
 //   VTRN     : Vector Transpose
 
@@ -3707,160 +4655,120 @@ def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
 
 //   VTBL     : Vector Table Lookup
 def  VTBL1
-  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst),
-        (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1,
-        "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "",
-        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>;
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
+        "vtbl", "8", "$Vd, \\{$Vn\\}, $Vm", "",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 DPR:$Vn, DPR:$Vm)))]>;
 let hasExtraSrcRegAllocReq = 1 in {
 def  VTBL2
-  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst),
-        (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2,
-        "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", []>;
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTB2,
+        "vtbl", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "", []>;
 def  VTBL3
-  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst),
-        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3,
-        "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", []>;
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), NVTBLFrm, IIC_VTB3,
+        "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", "", []>;
 def  VTBL4
-  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst),
-        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src),
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm),
         NVTBLFrm, IIC_VTB4,
-        "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", []>;
+        "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", "", []>;
 } // hasExtraSrcRegAllocReq = 1
 
+def  VTBL2Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QPR:$tbl, DPR:$src), IIC_VTB2, "", []>;
+def  VTBL3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>;
+def  VTBL4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>;
+
 //   VTBX     : Vector Table Extension
 def  VTBX1
-  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst),
-        (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1,
-        "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst",
-        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1
-                               DPR:$orig, DPR:$tbl1, DPR:$src)))]>;
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1,
+        "vtbx", "8", "$Vd, \\{$Vn\\}, $Vm", "$orig = $Vd",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, DPR:$Vn, DPR:$Vm)))]>;
 let hasExtraSrcRegAllocReq = 1 in {
 def  VTBX2
-  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst),
-        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2,
-        "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", []>;
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTBX2,
+        "vtbx", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "$orig = $Vd", []>;
 def  VTBX3
-  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst),
-        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src),
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm),
         NVTBLFrm, IIC_VTBX3,
-        "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src",
-        "$orig = $dst", []>;
+        "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm",
+        "$orig = $Vd", []>;
 def  VTBX4
-  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1,
-        DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4,
-        "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src",
-        "$orig = $dst", []>;
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), (ins DPR:$orig, DPR:$Vn,
+        DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), NVTBLFrm, IIC_VTBX4,
+        "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm",
+        "$orig = $Vd", []>;
 } // hasExtraSrcRegAllocReq = 1
 
+def  VTBX2Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QPR:$tbl, DPR:$src),
+                IIC_VTBX2, "$orig = $dst", []>;
+def  VTBX3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX3, "$orig = $dst", []>;
+def  VTBX4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX4, "$orig = $dst", []>;
+
 //===----------------------------------------------------------------------===//
 // NEON instructions for single-precision FP math
 //===----------------------------------------------------------------------===//
 
-class N2VSPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst>
-  : NEONFPPat<(ResTy (OpNode SPR:$a)),
-              (EXTRACT_SUBREG (OpTy (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)),
-                                                       SPR:$a, ssub_0))),
-                              ssub_0)>;
+class N2VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>;
 
 class N3VSPat<SDNode OpNode, NeonI Inst>
   : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
-              (EXTRACT_SUBREG (v2f32
-                                 (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
-                                                      SPR:$a, ssub_0),
-                                       (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
-                                                      SPR:$b, ssub_0))),
-                              ssub_0)>;
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
 
 class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
   : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
-              (EXTRACT_SUBREG (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
-                                                   SPR:$acc, ssub_0),
-                                    (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
-                                                   SPR:$a, ssub_0),
-                                    (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
-                                                   SPR:$b, ssub_0)),
-                              ssub_0)>;
-
-// These need separate instructions because they must use DPR_VFP2 register
-// class which have SPR sub-registers.
-
-// Vector Add Operations used for single-precision FP
-let neverHasSideEffects = 1 in
-def VADDfd_sfp : N3VS<0,0,0b00,0b1101,0, "vadd", "f32", v2f32, v2f32, fadd, 1>;
-def : N3VSPat<fadd, VADDfd_sfp>;
-
-// Vector Sub Operations used for single-precision FP
-let neverHasSideEffects = 1 in
-def VSUBfd_sfp : N3VS<0,0,0b10,0b1101,0, "vsub", "f32", v2f32, v2f32, fsub, 0>;
-def : N3VSPat<fsub, VSUBfd_sfp>;
-
-// Vector Multiply Operations used for single-precision FP
-let neverHasSideEffects = 1 in
-def VMULfd_sfp : N3VS<1,0,0b00,0b1101,1, "vmul", "f32", v2f32, v2f32, fmul, 1>;
-def : N3VSPat<fmul, VMULfd_sfp>;
-
-// Vector Multiply-Accumulate/Subtract used for single-precision FP
-// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so
-// we want to avoid them for now. e.g., alternating vmla/vadd instructions.
-
-//let neverHasSideEffects = 1 in
-//def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32",
-//                           v2f32, fmul, fadd>;
-//def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>;
-
-//let neverHasSideEffects = 1 in
-//def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32",
-//                           v2f32, fmul, fsub>;
-//def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>;
-
-// Vector Absolute used for single-precision FP
-let neverHasSideEffects = 1 in
-def  VABSfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01110, 0, 0,
-                      (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
-                      "vabs", "f32", "$dst, $src", "", []>;
-def : N2VSPat<fabs, f32, v2f32, VABSfd_sfp>;
-
-// Vector Negate used for single-precision FP
-let neverHasSideEffects = 1 in
-def  VNEGfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
-                      (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
-                      "vneg", "f32", "$dst, $src", "", []>;
-def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>;
-
-// Vector Maximum used for single-precision FP
-let neverHasSideEffects = 1 in
-def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst),
-                     (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND,
-                     "vmax", "f32", "$dst, $src1, $src2", "", []>;
-def : N3VSPat<NEONfmax, VMAXfd_sfp>;
-
-// Vector Minimum used for single-precision FP
-let neverHasSideEffects = 1 in
-def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst),
-                     (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND,
-                     "vmin", "f32", "$dst, $src1, $src2", "", []>;
-def : N3VSPat<NEONfmin, VMINfd_sfp>;
-
-// Vector Convert between single-precision FP and integer
-let neverHasSideEffects = 1 in
-def  VCVTf2sd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
-                         v2i32, v2f32, fp_to_sint>;
-def : N2VSPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
-
-let neverHasSideEffects = 1 in
-def  VCVTf2ud_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
-                         v2i32, v2f32, fp_to_uint>;
-def : N2VSPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
-
-let neverHasSideEffects = 1 in
-def  VCVTs2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
-                         v2f32, v2i32, sint_to_fp>;
-def : N2VSPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
-
-let neverHasSideEffects = 1 in
-def  VCVTu2fd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
-                         v2f32, v2i32, uint_to_fp>;
-def : N2VSPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$acc, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+def : N3VSPat<fadd, VADDfd>;
+def : N3VSPat<fsub, VSUBfd>;
+def : N3VSPat<fmul, VMULfd>;
+def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
+def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
+def : N2VSPat<fabs, VABSfd>;
+def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPat<NEONfmax, VMAXfd>;
+def : N3VSPat<NEONfmin, VMINfd>;
+def : N2VSPat<arm_ftosi, VCVTf2sd>;
+def : N2VSPat<arm_ftoui, VCVTf2ud>;
+def : N2VSPat<arm_sitof, VCVTs2fd>;
+def : N2VSPat<arm_uitof, VCVTu2fd>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
index a13ff12..826ef46 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1,4 +1,4 @@
-//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===//
+//===- ARMInstrThumb.td - Thumb support for ARM ------------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,7 +16,7 @@
 //
 
 def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
-                      [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                        SDNPVariadic]>;
 
 def imm_neg_XFORM : SDNodeXForm<imm, [{
@@ -26,7 +26,6 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
 }]>;
 
-
 /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
 def imm0_7 : PatLeaf<(i32 imm), [{
   return (uint32_t)N->getZExtValue() < 8;
@@ -50,9 +49,9 @@ def imm8_255_neg : PatLeaf<(i32 imm), [{
   return Val >= 8 && Val < 256;
 }], imm_neg_XFORM>;
 
-// Break imm's up into two pieces: an immediate + a left shift.
-// This uses thumb_immshifted to match and thumb_immshifted_val and
-// thumb_immshifted_shamt to get the val/shift pieces.
+// Break imm's up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
+// to get the val/shift pieces.
 def thumb_immshifted : PatLeaf<(imm), [{
   return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
 }]>;
@@ -67,6 +66,11 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(V, MVT::i32);
 }]>;
 
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+}
+
 // Scaled 4 immediate.
 def t_imm_s4 : Operand<i32> {
   let PrintMethod = "printThumbS4ImmOperand";
@@ -74,47 +78,114 @@ def t_imm_s4 : Operand<i32> {
 
 // Define Thumb specific addressing modes.
 
+def t_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getThumbBRTargetOpValue";
+}
+
+def t_bcctarget : Operand<i32> {
+  let EncoderMethod = "getThumbBCCTargetOpValue";
+}
+
+def t_cbtarget : Operand<i32> {
+  let EncoderMethod = "getThumbCBTargetOpValue";
+}
+
+def t_bltarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLTargetOpValue";
+}
+
+def t_blxtarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLXTargetOpValue";
+}
+
+def MemModeRegThumbAsmOperand : AsmOperandClass {
+  let Name = "MemModeRegThumb";
+  let SuperClasses = [];
+}
+
+def MemModeImmThumbAsmOperand : AsmOperandClass {
+  let Name = "MemModeImmThumb";
+  let SuperClasses = [];
+}
+
 // t_addrmode_rr := reg + reg
 //
 def t_addrmode_rr : Operand<i32>,
                     ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
   let PrintMethod = "printThumbAddrModeRROperand";
   let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
 }
 
-// t_addrmode_s4 := reg + reg
-//                  reg + imm5 * 4
+// t_addrmode_rrs := reg + reg
 //
-def t_addrmode_s4 : Operand<i32>,
-                    ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> {
-  let PrintMethod = "printThumbAddrModeS4Operand";
-  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+def t_addrmode_rrs1 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+  let ParserMatchClass = MemModeRegThumbAsmOperand;
+}
+def t_addrmode_rrs2 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+  let ParserMatchClass = MemModeRegThumbAsmOperand;
+}
+def t_addrmode_rrs4 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+  let ParserMatchClass = MemModeRegThumbAsmOperand;
+}
+
+// t_addrmode_is4 := reg + imm5 * 4
+//
+def t_addrmode_is4 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let PrintMethod = "printThumbAddrModeImm5S4Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
-// t_addrmode_s2 := reg + reg
-//                  reg + imm5 * 2
+// t_addrmode_is2 := reg + imm5 * 2
 //
-def t_addrmode_s2 : Operand<i32>,
-                    ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> {
-  let PrintMethod = "printThumbAddrModeS2Operand";
-  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+def t_addrmode_is2 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let PrintMethod = "printThumbAddrModeImm5S2Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
-// t_addrmode_s1 := reg + reg
-//                  reg + imm5
+// t_addrmode_is1 := reg + imm5
 //
-def t_addrmode_s1 : Operand<i32>,
-                    ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> {
-  let PrintMethod = "printThumbAddrModeS1Operand";
-  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+def t_addrmode_is1 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let PrintMethod = "printThumbAddrModeImm5S1Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 // t_addrmode_sp := sp + imm8 * 4
 //
 def t_addrmode_sp : Operand<i32>,
                     ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let EncoderMethod = "getAddrModeThumbSPOpValue";
   let PrintMethod = "printThumbAddrModeSPOperand";
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemModeImmThumbAsmOperand;
+}
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : Operand<i32> {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let ParserMatchClass = MemModeImmThumbAsmOperand;
 }
 
 //===----------------------------------------------------------------------===//
@@ -126,132 +197,162 @@ def t_addrmode_sp : Operand<i32>,
 // these will always be in pairs, and asserts if it finds otherwise. Better way?
 let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
 def tADJCALLSTACKUP :
-PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
-           "${:comment} tADJCALLSTACKUP $amt1",
-           [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>;
+  PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+             [(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
 
 def tADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
-           "${:comment} tADJCALLSTACKDOWN $amt",
-           [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>;
+  PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+             [(ARMcallseq_start imm:$amt)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+}
+
+// T1Disassembly - A simple class to make encoding some disassembly patterns
+// easier and less verbose.
+class T1Disassembly<bits<2> op1, bits<8> op2>
+  : T1Encoding<0b101111> {
+  let Inst{9-8} = op1;
+  let Inst{7-0} = op2;
 }
 
 def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "",
                 [/* For disassembly only; pattern left blank */]>,
-           T1Encoding<0b101111> {
-  let Inst{9-8} = 0b11;
-  let Inst{7-0} = 0b00000000;
-} 
+           T1Disassembly<0b11, 0x00>; // A8.6.110
 
 def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "",
                   [/* For disassembly only; pattern left blank */]>,
-             T1Encoding<0b101111> {
-  let Inst{9-8} = 0b11;
-  let Inst{7-0} = 0b00010000;
-} 
+           T1Disassembly<0b11, 0x10>; // A8.6.410
 
 def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "",
                 [/* For disassembly only; pattern left blank */]>,
-           T1Encoding<0b101111> {
-  let Inst{9-8} = 0b11;
-  let Inst{7-0} = 0b00100000;
-} 
+           T1Disassembly<0b11, 0x20>; // A8.6.408
 
 def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "",
                 [/* For disassembly only; pattern left blank */]>,
-           T1Encoding<0b101111> {
-  let Inst{9-8} = 0b11;
-  let Inst{7-0} = 0b00110000;
-} 
+           T1Disassembly<0b11, 0x30>; // A8.6.409
 
 def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "",
                 [/* For disassembly only; pattern left blank */]>,
-           T1Encoding<0b101111> {
-  let Inst{9-8} = 0b11;
-  let Inst{7-0} = 0b01000000;
-} 
+           T1Disassembly<0b11, 0x40>; // A8.6.157
+
+// The i32imm operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val",
+                [/* For disassembly only; pattern left blank */]>,
+           T1Disassembly<0b10, {?,?,?,?,?,?,?,?}> {
+  // A8.6.22
+  bits<8> val;
+  let Inst{7-0} = val;
+}
 
 def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe",
                     [/* For disassembly only; pattern left blank */]>,
                 T1Encoding<0b101101> {
+  // A8.6.156
   let Inst{9-5} = 0b10010;
-  let Inst{3} = 1;
+  let Inst{4}   = 1;
+  let Inst{3}   = 1;            // Big-Endian
+  let Inst{2-0} = 0b000;
 }
 
 def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle",
                     [/* For disassembly only; pattern left blank */]>,
                 T1Encoding<0b101101> {
+  // A8.6.156
   let Inst{9-5} = 0b10010;
-  let Inst{3} = 0;
+  let Inst{4}   = 1;
+  let Inst{3}   = 0;            // Little-Endian
+  let Inst{2-0} = 0b000;
 }
 
-// The i32imm operand $val can be used by a debugger to store more information
-// about the breakpoint.
-def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val",
+// Change Processor State is a system instruction -- for disassembly only.
+def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
+                NoItinerary, "cps$imod $iflags",
                 [/* For disassembly only; pattern left blank */]>,
-            T1Encoding<0b101111> {
-  let Inst{9-8} = 0b10;
+           T1Misc<0b0110011> {
+  // A8.6.38 & B6.1.1
+  bit imod;
+  bits<3> iflags;
+
+  let Inst{4}   = imod;
+  let Inst{3}   = 0;
+  let Inst{2-0} = iflags;
 }
 
-// Change Processor State is a system instruction -- for disassembly only.
-// The singleton $opt operand contains the following information:
-// opt{4-0} = mode ==> don't care
-// opt{5} = changemode ==> 0 (false for 16-bit Thumb instr)
-// opt{8-6} = AIF from Inst{2-0}
-// opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable
-//
-// The opt{4-0} and opt{5} sub-fields are to accommodate 32-bit Thumb and ARM
-// CPS which has more options.
-def tCPS : T1I<(outs), (ins cps_opt:$opt), NoItinerary, "cps$opt",
-              [/* For disassembly only; pattern left blank */]>,
-           T1Misc<0b0110011>;
-
 // For both thumb1 and thumb2.
-let isNotDuplicable = 1 in
-def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr,
-                 "\n$cp:\n\tadd\t$dst, pc",
-                 [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+let isNotDuplicable = 1, isCodeGenOnly = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
               T1Special<{0,0,?,?}> {
-  let Inst{6-3} = 0b1111; // A8.6.6 Rm = pc
+  // A8.6.6
+  bits<3> dst;
+  let Inst{6-3} = 0b1111; // Rm = pc
+  let Inst{2-0} = dst;
 }
 
-// PC relative add.
+// PC relative add (ADR).
 def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi,
-                  "add\t$dst, pc, $rhs", []>,
-               T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
+                   "add\t$dst, pc, $rhs", []>,
+               T1Encoding<{1,0,1,0,0,?}> {
+  // A6.2 & A8.6.10
+  bits<3> dst;
+  bits<8> rhs;
+  let Inst{10-8} = dst;
+  let Inst{7-0}  = rhs;
+}
 
-// ADD rd, sp, #imm8
+// ADD <Rd>, sp, #<imm8>
 // This is rematerializable, which is particularly useful for taking the
 // address of locals.
-let isReMaterializable = 1 in {
+let isReMaterializable = 1 in
 def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi,
-                  "add\t$dst, $sp, $rhs", []>,
-               T1Encoding<{1,0,1,0,1,?}>; // A6.2 & A8.6.8
+                   "add\t$dst, $sp, $rhs", []>,
+               T1Encoding<{1,0,1,0,1,?}> {
+  // A6.2 & A8.6.8
+  bits<3> dst;
+  bits<8> rhs;
+  let Inst{10-8} = dst;
+  let Inst{7-0}  = rhs;
 }
 
-// ADD sp, sp, #imm7
+// ADD sp, sp, #<imm7>
 def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
                   "add\t$dst, $rhs", []>,
-              T1Misc<{0,0,0,0,0,?,?}>; // A6.2.5 & A8.6.8
+              T1Misc<{0,0,0,0,0,?,?}> {
+  // A6.2.5 & A8.6.8
+  bits<7> rhs;
+  let Inst{6-0} = rhs;
+}
 
-// SUB sp, sp, #imm7
+// SUB sp, sp, #<imm7>
+// FIXME: The encoding and the ASM string don't match up.
 def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
                   "sub\t$dst, $rhs", []>,
-              T1Misc<{0,0,0,0,1,?,?}>; // A6.2.5 & A8.6.215
+              T1Misc<{0,0,0,0,1,?,?}> {
+  // A6.2.5 & A8.6.214
+  bits<7> rhs;
+  let Inst{6-0} = rhs;
+}
 
-// ADD rm, sp
+// ADD <Rm>, sp
 def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                   "add\t$dst, $rhs", []>,
               T1Special<{0,0,?,?}> {
-  let Inst{6-3} = 0b1101; // A8.6.9 Encoding T1
+  // A8.6.9 Encoding T1
+  bits<4> dst;
+  let Inst{7}   = dst{3};
+  let Inst{6-3} = 0b1101;
+  let Inst{2-0} = dst{2-0};
 }
 
-// ADD sp, rm
+// ADD sp, <Rm>
 def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                   "add\t$dst, $rhs", []>,
               T1Special<{0,0,?,?}> {
   // A8.6.9 Encoding T2
+  bits<4> dst;
   let Inst{7} = 1;
+  let Inst{6-3} = dst;
   let Inst{2-0} = 0b101;
 }
 
@@ -260,21 +361,37 @@ def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
 //
 
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-  def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr", [(ARMretflag)]>,
-                T1Special<{1,1,0,?}> { // A6.2.3 & A8.6.25
+  def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr",
+                   [(ARMretflag)]>,
+                T1Special<{1,1,0,?}> {
+    // A6.2.3 & A8.6.25
     let Inst{6-3} = 0b1110; // Rm = lr
+    let Inst{2-0} = 0b000;
   }
+
   // Alternative return instruction used by vararg functions.
-  def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target",[]>,
-                       T1Special<{1,1,0,?}>; // A6.2.3 & A8.6.25
+  def tBX_RET_vararg : TI<(outs), (ins tGPR:$Rm),
+                          IIC_Br, "bx\t$Rm",
+                          []>,
+                       T1Special<{1,1,0,?}> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+  }
 }
 
 // Indirect branches
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
-  def tBRIND : TI<(outs), (ins GPR:$dst), IIC_Br, "mov\tpc, $dst",
-                  [(brind GPR:$dst)]>,
-               T1Special<{1,0,1,?}> {
-    // <Rd> = Inst{7:2-0} = pc
+  def tBRIND : TI<(outs), (ins GPR:$Rm),
+                  IIC_Br,
+                  "mov\tpc, $Rm",
+                  [(brind GPR:$Rm)]>,
+               T1Special<{1,0,?,?}> {
+    // A8.6.97
+    bits<4> Rm;
+    let Inst{7}   = 1;          // <Rd> = Inst{7:2-0} = pc
+    let Inst{6-3} = Rm;
     let Inst{2-0} = 0b111;
   }
 }
@@ -282,28 +399,52 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 // FIXME: remove when we have a way to marking a MI with these properties.
 let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
-def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br,
-                   "pop${p}\t$dsts", []>,
-               T1Misc<{1,1,0,?,?,?,?}>;
+def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                   IIC_iPop_Br,
+                   "pop${p}\t$regs", []>,
+               T1Misc<{1,1,0,?,?,?,?}> {
+  // A8.6.121
+  bits<16> regs;
+  let Inst{8}   = regs{15};     // registers = P:'0000000':register_list
+  let Inst{7-0} = regs{7-0};
+}
 
+// All calls clobber the non-callee saved registers. SP is marked as a use to
+// prevent stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
 let isCall = 1,
+  // On non-Darwin platforms R9 is callee-saved.
   Defs = [R0,  R1,  R2,  R3,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Uses = [SP] in {
   // Also used for Thumb2
   def tBL  : TIx2<0b11110, 0b11, 1,
-                  (outs), (ins i32imm:$func, variable_ops), IIC_Br,
-                  "bl\t${func:call}",
+                  (outs), (ins t_bltarget:$func, variable_ops), IIC_Br,
+                  "bl\t$func",
                   [(ARMtcall tglobaladdr:$func)]>,
-             Requires<[IsThumb, IsNotDarwin]>;
+             Requires<[IsThumb, IsNotDarwin]> {
+    bits<21> func;
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = 1;
+    let Inst{11} = 1;
+    let Inst{10-0} = func{10-0};
+  }
 
   // ARMv5T and above, also used for Thumb2
   def tBLXi : TIx2<0b11110, 0b11, 0,
-                   (outs), (ins i32imm:$func, variable_ops), IIC_Br,
-                   "blx\t${func:call}",
+                   (outs), (ins t_blxtarget:$func, variable_ops), IIC_Br,
+                   "blx\t$func",
                    [(ARMcall tglobaladdr:$func)]>,
-              Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+              Requires<[IsThumb, HasV5T, IsNotDarwin]> {
+    bits<21> func;
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = 1;
+    let Inst{11} = 1;
+    let Inst{10-1} = func{10-1};
+    let Inst{0} = 0; // func{0} is assumed zero
+  }
 
   // Also used for Thumb2
   def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
@@ -313,642 +454,1002 @@ let isCall = 1,
               T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24;
 
   // ARMv4T
+  // FIXME: Should be a pseudo.
+  let isCodeGenOnly = 1 in
   def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?,
                   (outs), (ins tGPR:$func, variable_ops), IIC_Br,
                   "mov\tlr, pc\n\tbx\t$func",
                   [(ARMcall_nolink tGPR:$func)]>,
-            Requires<[IsThumb1Only, IsNotDarwin]>;
+            Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>;
 }
 
-// On Darwin R9 is call-clobbered.
 let isCall = 1,
+  // On Darwin R9 is call-clobbered.
+  // R7 is marked as a use to prevent frame-pointer assignments from being
+  // moved above / below calls.
   Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Uses = [R7, SP] in {
   // Also used for Thumb2
   def tBLr9 : TIx2<0b11110, 0b11, 1,
-                   (outs), (ins i32imm:$func, variable_ops), IIC_Br,
-                   "bl\t${func:call}",
+                   (outs), (ins pred:$p, t_bltarget:$func, variable_ops),
+                   IIC_Br, "bl${p}\t$func",
                    [(ARMtcall tglobaladdr:$func)]>,
-              Requires<[IsThumb, IsDarwin]>;
+              Requires<[IsThumb, IsDarwin]> {
+    bits<21> func;
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = 1;
+    let Inst{11} = 1;
+    let Inst{10-0} = func{10-0};
+  }
 
   // ARMv5T and above, also used for Thumb2
   def tBLXi_r9 : TIx2<0b11110, 0b11, 0,
-                      (outs), (ins i32imm:$func, variable_ops), IIC_Br,
-                      "blx\t${func:call}",
+                      (outs), (ins pred:$p, t_blxtarget:$func, variable_ops),
+                      IIC_Br, "blx${p}\t$func",
                       [(ARMcall tglobaladdr:$func)]>,
-                 Requires<[IsThumb, HasV5T, IsDarwin]>;
+                 Requires<[IsThumb, HasV5T, IsDarwin]> {
+    bits<21> func;
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = 1;
+    let Inst{11} = 1;
+    let Inst{10-1} = func{10-1};
+    let Inst{0} = 0; // func{0} is assumed zero
+  }
 
   // Also used for Thumb2
-  def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
-                    "blx\t$func",
+  def tBLXr_r9 : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
+                    "blx${p}\t$func",
                     [(ARMtcall GPR:$func)]>,
                  Requires<[IsThumb, HasV5T, IsDarwin]>,
-                 T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24
+                 T1Special<{1,1,1,?}> {
+    // A6.2.3 & A8.6.24
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
 
   // ARMv4T
+  let isCodeGenOnly = 1 in
+  // FIXME: Should be a pseudo.
   def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?,
                    (outs), (ins tGPR:$func, variable_ops), IIC_Br,
                    "mov\tlr, pc\n\tbx\t$func",
                    [(ARMcall_nolink tGPR:$func)]>,
-              Requires<[IsThumb1Only, IsDarwin]>;
+              Requires<[IsThumb, IsThumb1Only, IsDarwin]>;
 }
 
-let isBranch = 1, isTerminator = 1 in {
-  let isBarrier = 1 in {
-    let isPredicable = 1 in
-    def tB   : T1I<(outs), (ins brtarget:$target), IIC_Br,
-                   "b\t$target", [(br bb:$target)]>,
-               T1Encoding<{1,1,1,0,0,?}>;
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  let isPredicable = 1 in
+  def tB   : T1I<(outs), (ins t_brtarget:$target), IIC_Br,
+                 "b\t$target", [(br bb:$target)]>,
+             T1Encoding<{1,1,1,0,0,?}> {
+    bits<11> target;
+    let Inst{10-0} = target;
+  }
 
   // Far jump
+  // Just a pseudo for a tBL instruction. Needed to let regalloc know about
+  // the clobber of LR.
   let Defs = [LR] in
-  def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br,
-                    "bl\t$target\t${:comment} far jump",[]>;
-
-  def tBR_JTr : T1JTI<(outs),
-                      (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id),
-                      IIC_Br, "mov\tpc, $target\n\t.align\t2$jt",
-                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>,
-                Encoding16 {
-    let Inst{15-7} = 0b010001101;
-    let Inst{2-0} = 0b111;
-  }
+  def tBfar : tPseudoInst<(outs), (ins t_bltarget:$target),
+                          Size4Bytes, IIC_Br, []>;
+
+  def tBR_JTr : tPseudoInst<(outs),
+                      (ins tGPR:$target, i32imm:$jt, i32imm:$id),
+                      SizeSpecial, IIC_Br,
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> {
+    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
   }
 }
 
 // FIXME: should be able to write a pattern for ARMBrcond, but can't use
 // a two-value operand where a dag node expects two operands. :(
 let isBranch = 1, isTerminator = 1 in
-  def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br,
-                 "b$cc\t$target",
+  def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
+                 "b${p}\t$target",
                  [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
-             T1Encoding<{1,1,0,1,?,?}>;
+             T1Encoding<{1,1,0,1,?,?}> {
+  bits<4> p;
+  bits<8> target;
+  let Inst{11-8} = p;
+  let Inst{7-0} = target;
+}
 
 // Compare and branch on zero / non-zero
 let isBranch = 1, isTerminator = 1 in {
-  def tCBZ  : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br,
-                  "cbz\t$cmp, $target", []>,
-              T1Misc<{0,0,?,1,?,?,?}>;
+  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbz\t$Rn, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
 
-  def tCBNZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br,
+  def tCBNZ : T1I<(outs), (ins tGPR:$cmp, t_cbtarget:$target), IIC_Br,
                   "cbnz\t$cmp, $target", []>,
-              T1Misc<{1,0,?,1,?,?,?}>;
+              T1Misc<{1,0,?,1,?,?,?}> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
 }
 
 // A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only
 // A8.6.16 B: Encoding T1
 // If Inst{11-8} == 0b1111 then SEE SVC
-let isCall = 1 in {
-def tSVC : T1pI<(outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", []>,
-           Encoding16 {
+let isCall = 1, Uses = [SP] in
+def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br,
+                "svc", "\t$imm", []>, Encoding16 {
+  bits<8> imm;
   let Inst{15-12} = 0b1101;
-  let Inst{11-8} = 0b1111;
-}
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-0}   = imm;
 }
 
-// A8.6.16 B: Encoding T1
-// If Inst{11-8} == 0b1110 then UNDEFINED
-// FIXME: Temporary emitted as raw bytes until this pseudo-op will be added to
-// binutils
+// The assembler uses 0xDEFE for a trap instruction.
 let isBarrier = 1, isTerminator = 1 in
 def tTRAP : TI<(outs), (ins), IIC_Br, 
-               ".short 0xdefe ${:comment} trap", [(trap)]>, Encoding16 {
-  let Inst{15-12} = 0b1101;
-  let Inst{11-8} = 0b1110;
+               "trap", [(trap)]>, Encoding16 {
+  let Inst = 0xdefe;
 }
 
 //===----------------------------------------------------------------------===//
 //  Load Store Instructions.
 //
 
+// Loads: reg/reg and reg/imm5
 let canFoldAsLoad = 1, isReMaterializable = 1 in
-def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr,
-               "ldr", "\t$dst, $addr",
-               [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>,
-           T1LdSt<0b100>;
-def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr,
-               "ldr", "\t$dst, $addr",
-               []>,
-           T1LdSt4Imm<{1,?,?}>;
-
-def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
-                "ldrb", "\t$dst, $addr",
-                [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>,
-            T1LdSt<0b110>;
-def tLDRBi: T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
-                "ldrb", "\t$dst, $addr",
-                []>,
-            T1LdSt1Imm<{1,?,?}>;
-
-def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
-                "ldrh", "\t$dst, $addr",
-                [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>,
-            T1LdSt<0b101>;
-def tLDRHi: T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
-                "ldrh", "\t$dst, $addr",
-                []>,
-            T1LdSt2Imm<{1,?,?}>;
+multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 1 /* Load */,
+                      (outs tGPR:$Rt), (ins AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+                      (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR  : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iLoad_r, IIC_iLoad_i, "ldr",
+                                UnOpFrag<(load node:$Src)>>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+                                UnOpFrag<(zextloadi8 node:$Src)>>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+                                UnOpFrag<(zextloadi16 node:$Src)>>;
 
 let AddedComplexity = 10 in
-def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+def tLDRSB :                    // A8.6.80
+  T1pILdStEncode<0b011, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_1, IIC_iLoad_bh_r,
                  "ldrsb", "\t$dst, $addr",
-                 [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>,
-             T1LdSt<0b011>;
+                 [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
 
 let AddedComplexity = 10 in
-def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+def tLDRSH :                    // A8.6.84
+  T1pILdStEncode<0b111, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_2, IIC_iLoad_bh_r,
                  "ldrsh", "\t$dst, $addr",
-                 [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>,
-             T1LdSt<0b111>;
+                 [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
 
 let canFoldAsLoad = 1 in
-def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
-                  "ldr", "\t$dst, $addr",
-                  [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>,
-              T1LdStSP<{1,?,?}>;
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+                    "ldr", "\t$Rt, $addr",
+                    [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+              T1LdStSP<{1,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
 
 // Special instruction for restore. It cannot clobber condition register
 // when it's expanded by eliminateCallFramePseudoInstr().
 let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1 in
-def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
-                    "ldr", "\t$dst, $addr", []>,
-               T1LdStSP<{1,?,?}>;
+// FIXME: Pseudo for tLDRspi
+def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+                     "ldr", "\t$dst, $addr", []>,
+               T1LdStSP<{1,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
 
 // Load tconstpool
 // FIXME: Use ldr.n to work around a Darwin assembler bug.
 let canFoldAsLoad = 1, isReMaterializable = 1 in
-def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
-                  "ldr", ".n\t$dst, $addr",
-                  [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>,
-              T1Encoding<{0,1,0,0,1,?}>; // A6.2 & A8.6.59
-
-// Special LDR for loads from non-pc-relative constpools.
-let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
-    isReMaterializable = 1 in
-def tLDRcp  : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
-                  "ldr", "\t$dst, $addr", []>,
-              T1LdStSP<{1,?,?}>;
-
-def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
-               "str", "\t$src, $addr",
-               [(store tGPR:$src, t_addrmode_s4:$addr)]>,
-           T1LdSt<0b000>;
-def tSTRi: T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
-               "str", "\t$src, $addr",
-               []>,
-           T1LdSt4Imm<{0,?,?}>;
-
-def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
-                 "strb", "\t$src, $addr",
-                 [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>,
-            T1LdSt<0b010>;
-def tSTRBi: T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
-                 "strb", "\t$src, $addr",
-                 []>,
-            T1LdSt1Imm<{0,?,?}>;
-
-def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
-                 "strh", "\t$src, $addr",
-                 [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>,
-            T1LdSt<0b001>;
-def tSTRHi: T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
-                 "strh", "\t$src, $addr",
-                 []>,
-            T1LdSt2Imm<{0,?,?}>;
-
-def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
-                   "str", "\t$src, $addr",
-                   [(store tGPR:$src, t_addrmode_sp:$addr)]>,
-              T1LdStSP<{0,?,?}>;
-
-let mayStore = 1, neverHasSideEffects = 1 in {
-// Special instruction for spill. It cannot clobber condition register
-// when it's expanded by eliminateCallFramePseudoInstr().
-def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                  "ldr", ".n\t$Rt, $addr",
+                  [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+              T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iStore_r, IIC_iStore_i, "str",
+                                BinOpFrag<(store node:$LHS, node:$RHS)>>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+                                BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                                BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+                    "str", "\t$Rt, $addr",
+                    [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in
+// Special instruction for spill. It cannot clobber condition register when it's
+// expanded by eliminateCallFramePseudoInstr().
+// FIXME: Pseudo for tSTRspi
+def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStore_i,
                   "str", "\t$src, $addr", []>,
-             T1LdStSP<{0,?,?}>;
+             T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
 }
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
 //
 
-// These requires base address to be written back or one of the loaded regs.
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-def tLDM : T1I<(outs),
-               (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops),
-               IIC_iLoadm,
-               "ldm${addr:submode}${p}\t$addr, $dsts", []>,
-           T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53
-
-def tLDM_UPD : T1It<(outs tGPR:$wb),
-                    (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops),
-                    IIC_iLoadm,
-                    "ldm${addr:submode}${p}\t$addr!, $dsts",
-                    "$addr.addr = $wb", []>,
-               T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53
-} // mayLoad, neverHasSideEffects = 1, hasExtraDefRegAllocReq
-
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
-def tSTM_UPD : T1It<(outs tGPR:$wb),
-                    (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops),
-                    IIC_iStorem,
-                    "stm${addr:submode}${p}\t$addr!, $srcs",
-                    "$addr.addr = $wb", []>,
-           T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189
+multiclass thumb_ldst_mult<string asm, InstrItinClass itin,
+                           InstrItinClass itin_upd, bits<6> T1Enc,
+                           bit L_bit> {
+  def IA :
+    T1I<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+        itin, !strconcat(asm, "ia${p}\t$Rn, $regs"), []>,
+       T1Encoding<T1Enc> {
+    bits<3> Rn;
+    bits<8> regs;
+    let Inst{10-8} = Rn;
+    let Inst{7-0}  = regs;
+  }
+  def IA_UPD :
+    T1It<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []>,
+        T1Encoding<T1Enc> {
+    bits<3> Rn;
+    bits<8> regs;
+    let Inst{10-8} = Rn;
+    let Inst{7-0}  = regs;
+  }
+}
+
+// These require base address to be written back or one of the loaded regs.
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu,
+                            {1,1,0,0,1,?}, 1>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu,
+                            {1,1,0,0,0,?}, 0>;
+ 
+} // neverHasSideEffects
 
 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
-def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_Br,
-               "pop${p}\t$dsts", []>,
-           T1Misc<{1,1,0,?,?,?,?}>;
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+               IIC_iPop,
+               "pop${p}\t$regs", []>,
+           T1Misc<{1,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{15};
+  let Inst{7-0} = regs{7-0};
+}
 
 let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
-def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), IIC_Br,
-                "push${p}\t$srcs", []>,
-            T1Misc<{0,1,0,?,?,?,?}>;
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                IIC_iStore_m,
+                "push${p}\t$regs", []>,
+            T1Misc<{0,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{14};
+  let Inst{7-0} = regs{7-0};
+}
 
 //===----------------------------------------------------------------------===//
 //  Arithmetic Instructions.
 //
 
+// Helper classes for encoding T1pI patterns:
+class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rn;
+}
+class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
+                     string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1Misc<opA> {
+  bits<3> Rm;
+  bits<3> Rd;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sI patterns:
+class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  bits<3> Rd;
+  let Inst{8-6} = Rm;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                       string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sIt patterns:
+class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rdn;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rdn;
+}
+class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                        string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rdn;
+  bits<8> imm8;
+  let Inst{10-8} = Rdn;
+  let Inst{7-0}  = imm8;
+}
+
 // Add with carry register
 let isCommutable = 1, Uses = [CPSR] in
-def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                 "adc", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b0101>;
+def tADC :                      // A8.6.2
+  T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
+                "adc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>;
 
 // Add immediate
-def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
-                   "add", "\t$dst, $lhs, $rhs",
-                   [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>,
-             T1General<0b01110>;
+def tADDi3 :                    // A8.6.4 T1
+  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), IIC_iALUi,
+                   "add", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
 
-def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
-                   "add", "\t$dst, $rhs",
-                   [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>,
-             T1General<{1,1,0,?,?}>;
+def tADDi8 :                    // A8.6.4 T2
+  T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
+                    IIC_iALUi,
+                    "add", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
 
 // Add register
 let isCommutable = 1 in
-def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                   "add", "\t$dst, $lhs, $rhs",
-                   [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>,
-             T1General<0b01100>;
+def tADDrr :                    // A8.6.6 T1
+  T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "add", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>;
 
 let neverHasSideEffects = 1 in
-def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
-                     "add", "\t$dst, $rhs", []>,
-               T1Special<{0,0,?,?}>;
+def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
+                     "add", "\t$Rdn, $Rm", []>,
+               T1Special<{0,0,?,?}> {
+  // A8.6.6 T2
+  bits<4> Rdn;
+  bits<4> Rm;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rdn{2-0};
+}
 
-// And register
+// AND register
 let isCommutable = 1 in
-def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                 "and", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b0000>;
+def tAND :                      // A8.6.12
+  T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "and", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>;
 
 // ASR immediate
-def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
-                  "asr", "\t$dst, $lhs, $rhs",
-                  [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>,
-             T1General<{0,1,0,?,?}>;
+def tASRri :                    // A8.6.14
+  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+                   IIC_iMOVsi,
+                   "asr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
 
 // ASR register
-def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
-                   "asr", "\t$dst, $rhs",
-                   [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>,
-             T1DataProcessing<0b0100>;
+def tASRrr :                    // A8.6.15
+  T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "asr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>;
 
 // BIC register
-def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                 "bic", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>,
-           T1DataProcessing<0b1110>;
+def tBIC :                      // A8.6.20
+  T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "bic", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>;
 
 // CMN register
-let Defs = [CPSR] in {
+let isCompare = 1, Defs = [CPSR] in {
 //FIXME: Disable CMN, as CCodes are backwards from compare expectations
 //       Compare-to-zero still works out, just not the relationals
-//def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
-//                "cmn", "\t$lhs, $rhs",
-//                [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>,
-//           T1DataProcessing<0b1011>;
-def tCMNz : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
-                 "cmn", "\t$lhs, $rhs",
-                 [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>,
-            T1DataProcessing<0b1011>;
-}
+//def tCMN :                     // A8.6.33
+//  T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
+//               IIC_iCMPr,
+//               "cmn", "\t$lhs, $rhs",
+//               [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMNz :                     // A8.6.33
+  T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmn", "\t$Rn, $Rm",
+               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>;
+
+} // isCompare = 1, Defs = [CPSR]
 
 // CMP immediate
-let Defs = [CPSR] in {
-def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
-                  "cmp", "\t$lhs, $rhs",
-                  [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>,
-             T1General<{1,0,1,?,?}>;
-def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
-                  "cmp", "\t$lhs, $rhs",
-                  [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>,
-              T1General<{1,0,1,?,?}>;
+let isCompare = 1, Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, i32imm:$imm8), IIC_iCMPi,
+                  "cmp", "\t$Rn, $imm8",
+                  [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
+             T1General<{1,0,1,?,?}> {
+  // A8.6.35
+  bits<3> Rn;
+  bits<8> imm8;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = imm8;
 }
 
 // CMP register
-let Defs = [CPSR] in {
-def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
-                 "cmp", "\t$lhs, $rhs",
-                 [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>,
-            T1DataProcessing<0b1010>;
-def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
-                  "cmp", "\t$lhs, $rhs",
-                  [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>,
-             T1DataProcessing<0b1010>;
-
-def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
-                   "cmp", "\t$lhs, $rhs", []>,
-              T1Special<{0,1,?,?}>;
-def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
-                    "cmp", "\t$lhs, $rhs", []>,
-               T1Special<{0,1,?,?}>;
+def tCMPr :                     // A8.6.36 T1
+  T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmp", "\t$Rn, $Rm",
+               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
+                   "cmp", "\t$Rn, $Rm", []>,
+              T1Special<{0,1,?,?}> {
+  // A8.6.36 T2
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{7}   = Rn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rn{2-0};
 }
+} // isCompare = 1, Defs = [CPSR]
 
 
 // XOR register
 let isCommutable = 1 in
-def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                 "eor", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b0001>;
+def tEOR :                      // A8.6.45
+  T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "eor", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>;
 
 // LSL immediate
-def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
-                  "lsl", "\t$dst, $lhs, $rhs",
-                  [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>,
-             T1General<{0,0,0,?,?}>;
+def tLSLri :                    // A8.6.88
+  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+                   IIC_iMOVsi,
+                   "lsl", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
 
 // LSL register
-def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
-                   "lsl", "\t$dst, $rhs",
-                   [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>,
-             T1DataProcessing<0b0010>;
+def tLSLrr :                    // A8.6.89
+  T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsl", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>;
 
 // LSR immediate
-def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
-                  "lsr", "\t$dst, $lhs, $rhs",
-                  [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>,
-             T1General<{0,0,1,?,?}>;
+def tLSRri :                    // A8.6.90
+  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+                   IIC_iMOVsi,
+                   "lsr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
 
 // LSR register
-def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
-                   "lsr", "\t$dst, $rhs",
-                   [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>,
-             T1DataProcessing<0b0011>;
-
-// move register
-def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
-                  "mov", "\t$dst, $src",
-                  [(set tGPR:$dst, imm0_255:$src)]>,
-             T1General<{1,0,0,?,?}>;
+def tLSRrr :                    // A8.6.91
+  T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>;
+
+// Move register
+let isMoveImm = 1 in
+def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins i32imm:$imm8), IIC_iMOVi,
+                  "mov", "\t$Rd, $imm8",
+                  [(set tGPR:$Rd, imm0_255:$imm8)]>,
+             T1General<{1,0,0,?,?}> {
+  // A8.6.96
+  bits<3> Rd;
+  bits<8> imm8;
+  let Inst{10-8} = Rd;
+  let Inst{7-0}  = imm8;
+}
 
 // TODO: A7-73: MOV(2) - mov setting flag.
 
-
 let neverHasSideEffects = 1 in {
 // FIXME: Make this predicable.
-def tMOVr       : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
-                      "mov\t$dst, $src", []>,
-                  T1Special<0b1000>;
+def tMOVr       : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                      "mov\t$Rd, $Rm", []>,
+                  T1Special<0b1000> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  // Bits {7-6} are encoded by the T1Special value.
+  let Inst{5-3} = Rm{2-0};
+  let Inst{2-0} = Rd{2-0};
+}
 let Defs = [CPSR] in
-def tMOVSr      : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
-                       "movs\t$dst, $src", []>, Encoding16 {
+def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                      "movs\t$Rd, $Rm", []>, Encoding16 {
+  // A8.6.97
+  bits<3> Rd;
+  bits<3> Rm;
   let Inst{15-6} = 0b0000000000;
+  let Inst{5-3}  = Rm;
+  let Inst{2-0}  = Rd;
 }
 
 // FIXME: Make these predicable.
-def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr,
-                       "mov\t$dst, $src", []>,
-                   T1Special<{1,0,0,?}>;
-def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr,
-                       "mov\t$dst, $src", []>,
-                   T1Special<{1,0,?,0}>;
-def tMOVgpr2gpr  : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
-                       "mov\t$dst, $src", []>,
-                   T1Special<{1,0,?,?}>;
+def tMOVgpr2tgpr : T1I<(outs tGPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                       "mov\t$Rd, $Rm", []>,
+                   T1Special<{1,0,0,?}> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  // Bit {7} is encoded by the T1Special value.
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rd{2-0};
+}
+def tMOVtgpr2gpr : T1I<(outs GPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                       "mov\t$Rd, $Rm", []>,
+                   T1Special<{1,0,?,0}> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  // Bit {6} is encoded by the T1Special value.
+  let Inst{7}   = Rd{3};
+  let Inst{5-3} = Rm{2-0};
+  let Inst{2-0} = Rd{2-0};
+}
+def tMOVgpr2gpr  : T1I<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                       "mov\t$Rd, $Rm", []>,
+                   T1Special<{1,0,?,?}> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{7}   = Rd{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rd{2-0};
+}
 } // neverHasSideEffects
 
-// multiply register
+// Multiply register
 let isCommutable = 1 in
-def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32,
-                 "mul", "\t$dst, $rhs, $dst", /* A8.6.105 MUL Encoding T1 */
-                 [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b1101>;
-
-// move inverse register
-def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
-                "mvn", "\t$dst, $src",
-                [(set tGPR:$dst, (not tGPR:$src))]>,
-           T1DataProcessing<0b1111>;
-
-// bitwise or register
+def tMUL :                      // A8.6.105 T1
+  T1sItDPEncode<0b1101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMUL32,
+                "mul", "\t$Rdn, $Rm, $Rdn",
+                [(set tGPR:$Rdn, (mul tGPR:$Rn, tGPR:$Rm))]>;
+
+// Move inverse register
+def tMVN :                      // A8.6.107
+  T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
+               "mvn", "\t$Rd, $Rn",
+               [(set tGPR:$Rd, (not tGPR:$Rn))]>;
+
+// Bitwise or register
 let isCommutable = 1 in
-def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),  IIC_iALUr,
-                 "orr", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b1100>;
-
-// swaps
-def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                "rev", "\t$dst, $src",
-                [(set tGPR:$dst, (bswap tGPR:$src))]>,
-                Requires<[IsThumb1Only, HasV6]>,
-           T1Misc<{1,0,1,0,0,0,?}>;
-
-def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "rev16", "\t$dst, $src",
-             [(set tGPR:$dst,
-                   (or (and (srl tGPR:$src, (i32 8)), 0xFF),
-                       (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
-                           (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
-                               (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
-                Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{1,0,1,0,0,1,?}>;
-
-def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "revsh", "\t$dst, $src",
-                  [(set tGPR:$dst,
-                        (sext_inreg
-                          (or (srl (and tGPR:$src, 0xFF00), (i32 8)),
-                              (shl tGPR:$src, (i32 8))), i16))]>,
-                  Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{1,0,1,0,1,1,?}>;
-
-// rotate right register
-def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
-                 "ror", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b0111>;
-
-// negate register
-def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi,
-                "rsb", "\t$dst, $src, #0",
-                [(set tGPR:$dst, (ineg tGPR:$src))]>,
-           T1DataProcessing<0b1001>;
+def tORR :                      // A8.6.114
+  T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "orr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>;
+
+// Swaps
+def tREV :                      // A8.6.134
+  T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def tREV16 :                    // A8.6.135
+  T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev16", "\t$Rd, $Rm",
+             [(set tGPR:$Rd,
+                   (or (and (srl tGPR:$Rm, (i32 8)), 0xFF),
+                       (or (and (shl tGPR:$Rm, (i32 8)), 0xFF00),
+                           (or (and (srl tGPR:$Rm, (i32 8)), 0xFF0000),
+                               (and (shl tGPR:$Rm, (i32 8)), 0xFF000000)))))]>,
+                Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def tREVSH :                    // A8.6.136
+  T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "revsh", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd,
+                       (sext_inreg
+                         (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)),
+                             (shl tGPR:$Rm, (i32 8))), i16))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Rotate right register
+def tROR :                      // A8.6.139
+  T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "ror", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>;
+
+// Negate register
+def tRSB :                      // A8.6.141
+  T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
+               IIC_iALUi,
+               "rsb", "\t$Rd, $Rn, #0",
+               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
 
 // Subtract with carry register
 let Uses = [CPSR] in
-def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                 "sbc", "\t$dst, $rhs",
-                 [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>,
-           T1DataProcessing<0b0110>;
+def tSBC :                      // A8.6.151
+  T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sbc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>;
 
 // Subtract immediate
-def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
-                  "sub", "\t$dst, $lhs, $rhs",
-                  [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>,
-             T1General<0b01111>;
-
-def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
-                   "sub", "\t$dst, $rhs",
-                   [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>,
-             T1General<{1,1,1,?,?}>;
-
-// subtract register
-def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
-                  "sub", "\t$dst, $lhs, $rhs",
-                  [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>,
-             T1General<0b01101>;
+def tSUBi3 :                    // A8.6.210 T1
+  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+                   IIC_iALUi,
+                   "sub", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
 
-// TODO: A7-96: STMIA - store multiple.
+def tSUBi8 :                    // A8.6.210 T2
+  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
+                    IIC_iALUi,
+                    "sub", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
+
+// Subtract register
+def tSUBrr :                    // A8.6.212
+  T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sub", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
 
-// sign-extend byte
-def tSXTB  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "sxtb", "\t$dst, $src",
-                  [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
-                  Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{0,0,1,0,0,1,?}>;
-
-// sign-extend short
-def tSXTH  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "sxth", "\t$dst, $src",
-                  [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
-                  Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{0,0,1,0,0,0,?}>;
-
-// test
-let isCommutable = 1, Defs = [CPSR] in
-def tTST  : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
-                 "tst", "\t$lhs, $rhs",
-                 [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>,
-            T1DataProcessing<0b1000>;
-
-// zero-extend byte
-def tUXTB  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "uxtb", "\t$dst, $src",
-                  [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
-                  Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{0,0,1,0,1,1,?}>;
-
-// zero-extend short
-def tUXTH  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
-                  "uxth", "\t$dst, $src",
-                  [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
-                  Requires<[IsThumb1Only, HasV6]>,
-             T1Misc<{0,0,1,0,1,0,?}>;
+// TODO: A7-96: STMIA - store multiple.
 
+// Sign-extend byte
+def tSXTB :                     // A8.6.222
+  T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Sign-extend short
+def tSXTH :                     // A8.6.224
+  T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Test
+let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
+def tTST :                      // A8.6.230
+  T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
+               "tst", "\t$Rn, $Rm",
+               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>;
+
+// Zero-extend byte
+def tUXTB :                     // A8.6.262
+  T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Zero-extend short
+def tUXTH :                     // A8.6.264
+  T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
 
 // Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
 // Expanded after instruction selection into a branch sequence.
 let usesCustomInserter = 1 in  // Expanded after instruction selection.
   def tMOVCCr_pseudo :
   PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
-              NoItinerary, "${:comment} tMOVCCr $cc",
+              NoItinerary,
              [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
 
 
 // 16-bit movcc in IT blocks for Thumb2.
 let neverHasSideEffects = 1 in {
-def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr,
-                    "mov", "\t$dst, $rhs", []>,
-              T1Special<{1,0,?,?}>;
+def tMOVCCr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iCMOVr,
+                    "mov", "\t$Rdn, $Rm", []>,
+              T1Special<{1,0,?,?}> {
+  bits<4> Rdn;
+  bits<4> Rm;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rdn{2-0};
+}
+
+let isMoveImm = 1 in
+def tMOVCCi : T1pIt<(outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$Rm), IIC_iCMOVi,
+                    "mov", "\t$Rdn, $Rm", []>,
+              T1General<{1,0,0,?,?}> {
+  bits<3> Rdn;
+  bits<8> Rm;
+  let Inst{10-8} = Rdn;
+  let Inst{7-0}  = Rm;
+}
 
-def tMOVCCi : T1pIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMOVi,
-                    "mov", "\t$dst, $rhs", []>,
-              T1General<{1,0,0,?,?}>;
 } // neverHasSideEffects
 
 // tLEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
-let neverHasSideEffects = 1 in {
-let isReMaterializable = 1 in
-def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
-                    "adr$p\t$dst, #$label", []>,
-                T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
 
-} // neverHasSideEffects
-def tLEApcrelJT : T1I<(outs tGPR:$dst),
-                      (ins i32imm:$label, nohash_imm:$id, pred:$p),
-                      IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>,
-                  T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
+def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
+               IIC_iALUi, "adr{$p}\t$Rd, #$addr", []>,
+               T1Encoding<{1,0,1,0,0,?}> {
+  bits<3> Rd;
+  bits<8> addr;
+  let Inst{10-8} = Rd;
+  let Inst{7-0} = addr;
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
+                              Size2Bytes, IIC_iALUi, []>;
+
+def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
+                              (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                              Size2Bytes, IIC_iALUi, []>;
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register -- for disassembly only
+//
+
+class tMovRCopro<string opc, bit direction>
+  : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
+                       GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+          [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-24} = 0b1110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>;
+def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>;
+
+class tMovRRCopro<string opc, bit direction>
+  : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
+          [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.  For disassembly only.
+//
+def tCDP : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
+                 c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+                 "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
 
 //===----------------------------------------------------------------------===//
 // TLS Instructions
 //
 
 // __aeabi_read_tp preserves the registers r1-r3.
-let isCall = 1,
-  Defs = [R0, LR] in {
-  def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
-                     "bl\t__aeabi_read_tp",
-                     [(set R0, ARMthread_pointer)]>;
+let isCall = 1, Defs = [R0, LR], Uses = [SP] in
+def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
+                   "bl\t__aeabi_read_tp",
+                   [(set R0, ARMthread_pointer)]> {
+  // Encoding is 0xf7fffffe.
+  let Inst = 0xf7fffffe;
 }
 
+//===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
-//   eh_sjlj_setjmp() is an instruction sequence to store the return
-//   address and save #0 in R0 for the non-longjmp case.
-//   Since by its nature we may be coming from some other function to get
-//   here, and we're using the stack frame for the containing function to
-//   save/restore registers, we can't keep anything live in regs across
-//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
-//   when we get here from a longjmp(). We force everthing out of registers
-//   except for our own input by listing the relevant registers in Defs. By
-//   doing so, we also cause the prologue/epilogue code to actively preserve
-//   all of the callee-saved resgisters, which is exactly what we want.
-//   $val is a scratch register for our use.
-let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ], hasSideEffects = 1,
-   isBarrier = 1  in {
-  def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
-                              AddrModeNone, SizeSpecial, NoItinerary,
-                              "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t"
-                              "adds\t$val, #7\n\t"
-                              "str\t$val, [$src, #4]\n\t"
-                              "movs\tr0, #0\n\t"
-                              "b\t1f\n\t"
-                              "movs\tr0, #1\t${:comment} end eh.setjmp\n\t"
-                              "1:", "",
-                   [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
-}
+// 
+
+// eh_sjlj_setjmp() is an instruction sequence to store the return address and
+// save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everthing out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved resgisters, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ],
+    hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                                  AddrModeNone, SizeSpecial, NoItinerary, "","",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
 
 // FIXME: Non-Darwin version(s)
-let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
-    Defs = [ R7, LR, SP ] in {
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R7, LR, SP ] in
 def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
-                             AddrModeNone, SizeSpecial, IndexModeNone,
-                             Pseudo, NoItinerary,
-                             "ldr\t$scratch, [$src, #8]\n\t"
-                             "mov\tsp, $scratch\n\t"
-                             "ldr\t$scratch, [$src, #4]\n\t"
-                             "ldr\tr7, [$src]\n\t"
-                             "bx\t$scratch", "",
-                         [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
-                                Requires<[IsThumb, IsDarwin]>;
-}
+                              AddrModeNone, SizeSpecial, IndexModeNone,
+                              Pseudo, NoItinerary, "", "",
+                              [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                             Requires<[IsThumb, IsDarwin]>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
 
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+            (tCMPi8  tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+            (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
+
 // Add with carry
 def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
             (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
@@ -991,27 +1492,42 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>,
       Requires<[IsThumb, HasV5T, IsDarwin]>;
 
 // zextload i1 -> zextload i8
-def : T1Pat<(zextloadi1 t_addrmode_s1:$addr),
-            (tLDRB t_addrmode_s1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
+            (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
+            (tLDRBi t_addrmode_is1:$addr)>;
 
 // extload -> zextload
-def : T1Pat<(extloadi1  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
-def : T1Pat<(extloadi8  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
-def : T1Pat<(extloadi16 t_addrmode_s2:$addr),  (tLDRH t_addrmode_s2:$addr)>;
+def : T1Pat<(extloadi1  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi1  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr),  (tLDRHi t_addrmode_is2:$addr)>;
 
 // If it's impossible to use [r,r] address mode for sextload, select to
 // ldr{b|h} + sxt{b|h} instead.
-def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
-            (tSXTB (tLDRB t_addrmode_s1:$addr))>,
-      Requires<[IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi16 t_addrmode_s2:$addr),
-            (tSXTH (tLDRH t_addrmode_s2:$addr))>,
-      Requires<[IsThumb1Only, HasV6]>;
-
-def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
-            (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>;
-def : T1Pat<(sextloadi16 t_addrmode_s1:$addr),
-            (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>;
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
 
 // Large immediate handling.
 
@@ -1028,8 +1544,7 @@ def : T1Pat<(i32 imm0_255_comp:$src),
 // scheduling.
 let isReMaterializable = 1 in
 def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
-                   NoItinerary,
-                   "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
+                             NoItinerary,
                [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
                                            imm:$cp))]>,
-               Requires<[IsThumb1Only]>;
+               Requires<[IsThumb, IsThumb1Only]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 6ba0a44..0e01be5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -21,16 +21,12 @@ def it_mask : Operand<i32> {
   let PrintMethod = "printThumbITMask";
 }
 
-// Table branch address
-def tb_addrmode : Operand<i32> {
-  let PrintMethod = "printTBAddrMode";
-}
-
 // Shifted operands. No register controlled shifts for Thumb2.
 // Note: We do not support rrx shifted operands yet.
 def t2_so_reg : Operand<i32>,    // reg imm
                 ComplexPattern<i32, 2, "SelectT2ShifterOperandReg",
                                [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getT2SORegOpValue";
   let PrintMethod = "printT2SOOperand";
   let MIOperandInfo = (ops rGPR, i32imm);
 }
@@ -47,11 +43,10 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
 
 // t2_so_imm - Match a 32-bit immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
-// immediate splatted into multiple bytes of the word. t2_so_imm values are
-// represented in the imm field in the same 12-bit form that they are encoded
-// into t2_so_imm instructions: the 8-bit immediate is the least significant
-// bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11].
-def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]>;
+// immediate splatted into multiple bytes of the word.
+def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> {
+  let EncoderMethod = "getT2SOImmOpValue";
+}
 
 // t2_so_imm_not - Match an immediate that is a complement
 // of a t2_so_imm.
@@ -63,7 +58,7 @@ def t2_so_imm_not : Operand<i32>,
 // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
 def t2_so_imm_neg : Operand<i32>,
                     PatLeaf<(imm), [{
-  return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
+  return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1;
 }], t2_so_imm_neg_XFORM>;
 
 // Break t2_so_imm's up into two pieces.  This handles immediates with up to 16
@@ -128,27 +123,41 @@ def imm0_255_not : PatLeaf<(i32 imm), [{
 // t2addrmode_imm12  := reg + imm12
 def t2addrmode_imm12 : Operand<i32>,
                        ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
-  let PrintMethod = "printT2AddrModeImm12Operand";
+  let PrintMethod = "printAddrModeImm12Operand";
+  let EncoderMethod = "getAddrModeImm12OpValue";
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemMode5AsmOperand;
 }
 
+// ADR instruction labels.
+def t2adrlabel : Operand<i32> {
+  let EncoderMethod = "getT2AdrLabelOpValue";
+}
+
+
 // t2addrmode_imm8  := reg +/- imm8
 def t2addrmode_imm8 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
   let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 def t2am_imm8_offset : Operand<i32>,
-                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", []>{
+                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
+                                      [], [SDNPWantRoot]> {
   let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+  let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
+  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 // t2addrmode_imm8s4  := reg +/- (imm8 << 2)
-def t2addrmode_imm8s4 : Operand<i32>,
-                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> {
+def t2addrmode_imm8s4 : Operand<i32> {
   let PrintMethod = "printT2AddrModeImm8s4Operand";
+  let EncoderMethod = "getT2AddrModeImm8s4OpValue";
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 def t2am_imm8s4_offset : Operand<i32> {
@@ -159,7 +168,9 @@ def t2am_imm8s4_offset : Operand<i32> {
 def t2addrmode_so_reg : Operand<i32>,
                         ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
   let PrintMethod = "printT2AddrModeSoRegOperand";
+  let EncoderMethod = "getT2AddrModeSORegOpValue";
   let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
+  let ParserMatchClass = MemMode5AsmOperand;
 }
 
 
@@ -167,45 +178,294 @@ def t2addrmode_so_reg : Operand<i32>,
 // Multiclass helpers...
 //
 
+
+class T2OneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{19-16}  = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2TwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sTwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+
+class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2FourReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+
 /// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
 /// unary operation that produces a value. These are predicable and can be
 /// changed to modify CPSR.
-multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode,
-                      bit Cheap = 0, bit ReMat = 0> {
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
    // shifted imm
-   def i : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
-                opc, "\t$dst, $src",
-                [(set rGPR:$dst, (opnode t2_so_imm:$src))]> {
+   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+                opc, "\t$Rd, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
      let isAsCheapAsAMove = Cheap;
      let isReMaterializable = ReMat;
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{19-16} = 0b1111; // Rn
      let Inst{15} = 0;
    }
    // register
-   def r : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVr,
-                opc, ".w\t$dst, $src",
-                [(set rGPR:$dst, (opnode rGPR:$src))]> {
+   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+                opc, ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{19-16} = 0b1111; // Rn
      let Inst{14-12} = 0b000; // imm3
      let Inst{7-6} = 0b00; // imm2
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def s : T2sI<(outs rGPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi,
-                opc, ".w\t$dst, $src",
-                [(set rGPR:$dst, (opnode t2_so_reg:$src))]> {
+   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rd, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{19-16} = 0b1111; // Rn
    }
 }
@@ -213,94 +473,97 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode,
 /// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
 /// binary operation that produces a value. These are predicable and can be
 /// changed to modify CPSR.
-multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
-                       bit Commutable = 0, string wide = ""> {
+multiclass T2I_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0, string wide = ""> {
    // shifted imm
-   def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
-                 opc, "\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]> {
+   def ri : T2sTwoRegImm<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
+                 opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{15} = 0;
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr,
-                 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> {
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{14-12} = 0b000; // imm3
      let Inst{7-6} = 0b00; // imm2
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
-                 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]> {
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
    }
 }
 
 /// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
 //  the ".w" prefix to indicate that they are wide.
-multiclass T2I_bin_w_irs<bits<4> opcod, string opc, PatFrag opnode,
-                         bit Commutable = 0> :
-    T2I_bin_irs<opcod, opc, opnode, Commutable, ".w">;
+multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> :
+    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w">;
 
 /// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
 /// reversed.  The 'rr' form is only defined for the disassembler; for codegen
 /// it is equivalent to the T2I_bin_irs counterpart.
 multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
    // shifted imm
-   def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
-                 opc, ".w\t$dst, $rhs, $lhs",
-                 [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> {
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, ".w\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{15} = 0;
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, rGPR:$lhs), IIC_iALUr,
-                 opc, "\t$dst, $rhs, $lhs",
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, "\t$Rd, $Rn, $Rm",
                  [/* For disassembly only; pattern left blank */]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
      let Inst{14-12} = 0b000; // imm3
      let Inst{7-6} = 0b00; // imm2
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
-                 opc, "\t$dst, $rhs, $lhs",
-                 [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> {
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = ?; // The S bit.
    }
 }
 
 /// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
 /// instruction modifies the CPSR register.
-let Defs = [CPSR] in {
-multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
-                         bit Commutable = 0> {
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+multiclass T2I_bin_s_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> {
    // shifted imm
-   def ri : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
-                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
-                [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
+   def ri : T2TwoRegImm<
+                (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), iii,
+                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
+                [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -308,9 +571,10 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
      let Inst{15} = 0;
    }
    // register
-   def rr : T2I<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr,
-                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
-                [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> {
+   def rr : T2ThreeReg<
+                (outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), iir,
+                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -321,9 +585,10 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
-                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
-                [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
+   def rs : T2TwoRegShiftedReg<
+                (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -340,51 +605,58 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    // The register-immediate version is re-materializable. This is useful
    // in particular for taking the address of a local.
    let isReMaterializable = 1 in {
-   def ri : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, ".w\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24} = 1;
      let Inst{23-21} = op23_21;
-     let Inst{20} = 0; // The S bit.
      let Inst{15} = 0;
    }
    }
    // 12-bit imm
-   def ri12 : T2I<(outs rGPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi,
-                  !strconcat(opc, "w"), "\t$dst, $lhs, $rhs",
-                  [(set rGPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> {
+   def ri12 : T2I<
+                  (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+     bits<4> Rd;
+     bits<4> Rn;
+     bits<12> imm;
      let Inst{31-27} = 0b11110;
-     let Inst{25} = 1;
-     let Inst{24} = 0;
+     let Inst{26} = imm{11};
+     let Inst{25-24} = 0b10;
      let Inst{23-21} = op23_21;
      let Inst{20} = 0; // The S bit.
+     let Inst{19-16} = Rn;
      let Inst{15} = 0;
+     let Inst{14-12} = imm{10-8};
+     let Inst{11-8} = Rd;
+     let Inst{7-0} = imm{7-0};
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> {
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24} = 1;
      let Inst{23-21} = op23_21;
-     let Inst{20} = 0; // The S bit.
      let Inst{14-12} = 0b000; // imm3
      let Inst{7-6} = 0b00; // imm2
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24} = 1;
      let Inst{23-21} = op23_21;
-     let Inst{20} = 0; // The S bit.
    }
 }
 
@@ -395,50 +667,49 @@ let Uses = [CPSR] in {
 multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                              bit Commutable = 0> {
    // shifted imm
-   def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
-                 opc, "\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>,
+   def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                 IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
-     let Inst{20} = 0; // The S bit.
      let Inst{15} = 0;
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>,
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
                  Requires<[IsThumb2]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = 0; // The S bit.
      let Inst{14-12} = 0b000; // imm3
      let Inst{7-6} = 0b00; // imm2
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>,
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
-     let Inst{20} = 0; // The S bit.
    }
 }
 
 // Carry setting variants
-let Defs = [CPSR] in {
+let isCodeGenOnly = 1, Defs = [CPSR] in {
 multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
                                bit Commutable = 0> {
    // shifted imm
-   def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
-                 opc, "\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>,
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
@@ -447,9 +718,9 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
      let Inst{15} = 0;
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>,
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
                  Requires<[IsThumb2]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
@@ -461,9 +732,10 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>,
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
                  Requires<[IsThumb2]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -476,12 +748,13 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 
 /// T2I_rbin_s_is - Same as T2I_rbin_irs except sets 's' bit and the register
 /// version is not needed since this is only for codegen.
-let Defs = [CPSR] in {
+let isCodeGenOnly = 1, Defs = [CPSR] in {
 multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
    // shifted imm
-   def ri : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
-                !strconcat(opc, "s"), ".w\t$dst, $rhs, $lhs",
-                [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> {
+   def ri : T2TwoRegImm<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -489,9 +762,10 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
      let Inst{15} = 0;
    }
    // shifted register
-   def rs : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
-                !strconcat(opc, "s"), "\t$dst, $rhs, $lhs",
-                [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> {
+   def rs : T2TwoRegShiftedReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                IIC_iALUsi, !strconcat(opc, "s"), "\t$Rd, $Rn, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -504,18 +778,20 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
 //  rotate operation that produces a value.
 multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
    // 5-bit imm
-   def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, imm1_31:$rhs))]> {
+   def ri : T2sTwoRegShiftImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$imm), IIC_iMOVsi,
+                 opc, ".w\t$Rd, $Rm, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, imm1_31:$imm))]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-21} = 0b010010;
      let Inst{19-16} = 0b1111; // Rn
      let Inst{5-4} = opcod;
    }
    // register
-   def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iMOVsr,
-                 opc, ".w\t$dst, $lhs, $rhs",
-                 [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> {
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-21} = opcod;
@@ -528,11 +804,14 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
 /// patterns. Similar to T2I_bin_irs except the instruction does not produce
 /// a explicit result, only implicitly set CPSR.
 let isCompare = 1, Defs = [CPSR] in {
-multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
+multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode> {
    // shifted imm
-   def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi,
-                opc, ".w\t$lhs, $rhs",
-                [(opnode GPR:$lhs, t2_so_imm:$rhs)]> {
+   def ri : T2OneRegCmpImm<
+                (outs), (ins GPR:$Rn, t2_so_imm:$imm), iii,
+                opc, ".w\t$Rn, $imm",
+                [(opnode GPR:$Rn, t2_so_imm:$imm)]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -541,7 +820,8 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
      let Inst{11-8} = 0b1111; // Rd
    }
    // register
-   def rr : T2I<(outs), (ins GPR:$lhs, rGPR:$rhs), IIC_iCMPr,
+   def rr : T2TwoRegCmp<
+                (outs), (ins GPR:$lhs, rGPR:$rhs), iir,
                 opc, ".w\t$lhs, $rhs",
                 [(opnode GPR:$lhs, rGPR:$rhs)]> {
      let Inst{31-27} = 0b11101;
@@ -554,9 +834,10 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
      let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi,
-                opc, ".w\t$lhs, $rhs",
-                [(opnode GPR:$lhs, t2_so_reg:$rhs)]> {
+   def rs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rn, $ShiftedRm",
+                [(opnode GPR:$Rn, t2_so_reg:$ShiftedRm)]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -567,20 +848,29 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
 }
 
 /// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
-multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> {
-  def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi,
-                   opc, ".w\t$dst, $addr",
-                   [(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]> {
+multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
+  def i12 : T2Ii12<(outs GPR:$Rt), (ins t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set GPR:$Rt, (opnode t2addrmode_imm12:$addr))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
     let Inst{23} = 1;
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<17> addr;
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm
   }
-  def i8  : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi,
-                   opc, "\t$dst, $addr",
-                   [(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]> {
+  def i8  : T2Ii8 <(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(set GPR:$Rt, (opnode t2addrmode_imm8:$addr))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
@@ -591,10 +881,18 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> {
     // Offset: index==TRUE, wback==FALSE
     let Inst{10} = 1; // The P bit.
     let Inst{8} = 0; // The W bit.
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{9}     = addr{8};    // U
+    let Inst{7-0}   = addr{7-0};  // imm
   }
-  def s   : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr,
-                   opc, ".w\t$dst, $addr",
-                   [(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]> {
+  def s   : T2Iso <(outs GPR:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(set GPR:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
@@ -602,10 +900,20 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> {
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
     let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
   }
-  def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
-                   opc, ".w\t$dst, $addr",
-                   [(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]> {
+
+  // FIXME: Is the pci variant actually needed?
+  def pci : T2Ipc <(outs GPR:$Rt), (ins i32imm:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
     let isReMaterializable = 1;
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
@@ -614,22 +922,35 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> {
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
     let Inst{19-16} = 0b1111; // Rn
+    bits<4> Rt;
+    bits<12> addr;
+    let Inst{15-12} = Rt{3-0};
+    let Inst{11-0}  = addr{11-0};
   }
 }
 
 /// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
-multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> {
-  def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei,
-                   opc, ".w\t$src, $addr",
-                   [(opnode GPR:$src, t2addrmode_imm12:$addr)]> {
+multiclass T2I_st<bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
+  def i12 : T2Ii12<(outs), (ins GPR:$Rt, t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode GPR:$Rt, t2addrmode_imm12:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0001;
     let Inst{22-21} = opcod;
     let Inst{20} = 0; // !load
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<17> addr;
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm
   }
-  def i8  : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei,
-                   opc, "\t$src, $addr",
-                   [(opnode GPR:$src, t2addrmode_imm8:$addr)]> {
+  def i8  : T2Ii8 <(outs), (ins GPR:$Rt, t2addrmode_imm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(opnode GPR:$Rt, t2addrmode_imm8:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
@@ -638,24 +959,40 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> {
     // Offset: index==TRUE, wback==FALSE
     let Inst{10} = 1; // The P bit.
     let Inst{8} = 0; // The W bit.
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{9}     = addr{8};    // U
+    let Inst{7-0}   = addr{7-0};  // imm
   }
-  def s   : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer,
-                   opc, ".w\t$src, $addr",
-                   [(opnode GPR:$src, t2addrmode_so_reg:$addr)]> {
+  def s   : T2Iso <(outs), (ins GPR:$Rt, t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode GPR:$Rt, t2addrmode_so_reg:$addr)]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
     let Inst{20} = 0; // !load
     let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16}   = addr{9-6}; // Rn
+    let Inst{3-0} = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
   }
 }
 
-/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a
+/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
-  def r     : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                  opc, ".w\t$dst, $src",
-                 [(set rGPR:$dst, (opnode rGPR:$src))]> {
+multiclass T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode> {
+  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
+                  opc, ".w\t$Rd, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
@@ -664,25 +1001,27 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi,
-                  opc, ".w\t$dst, $src, ror $rot",
-                 [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]> {
+  def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+                  opc, ".w\t$Rd, $Rm, ror $rot",
+                 [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
      let Inst{19-16} = 0b1111; // Rn
      let Inst{15-12} = 0b1111;
      let Inst{7} = 1;
-     let Inst{5-4} = {?,?}; // rotate
+
+     bits<2> rot;
+     let Inst{5-4} = rot{1-0}; // rotate
    }
 }
 
 // UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
-multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
-  def r     : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                  opc, "\t$dst, $src",
-                 [(set rGPR:$dst, (opnode rGPR:$src))]>,
-                 Requires<[HasT2ExtractPack]> {
+multiclass T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
+  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
+                  opc, "\t$Rd, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm))]>,
+                 Requires<[HasT2ExtractPack, IsThumb2]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
@@ -691,25 +1030,27 @@ multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi,
-                  opc, "\t$dst, $src, ror $rot",
-                 [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]>,
-                 Requires<[HasT2ExtractPack]> {
+  def r_rot : T2TwoReg<(outs rGPR:$dst), (ins rGPR:$Rm, rot_imm:$rot),
+                  IIC_iEXTr, opc, "\t$dst, $Rm, ror $rot",
+                 [(set rGPR:$dst, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+                 Requires<[HasT2ExtractPack, IsThumb2]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
      let Inst{19-16} = 0b1111; // Rn
      let Inst{15-12} = 0b1111;
      let Inst{7} = 1;
-     let Inst{5-4} = {?,?}; // rotate
+
+     bits<2> rot;
+     let Inst{5-4} = rot{1-0}; // rotate
    }
 }
 
 // SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
 // supported yet.
-multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> {
-  def r     : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                  opc, "\t$dst, $src", []> {
+multiclass T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> {
+  def r     : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
+                  opc, "\t$Rd, $Rm", []> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
@@ -718,25 +1059,27 @@ multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi,
-                  opc, "\t$dst, $src, ror $rot", []> {
+  def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$rot), IIC_iEXTr,
+                  opc, "\t$Rd, $Rm, ror $rot", []> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
      let Inst{19-16} = 0b1111; // Rn
      let Inst{15-12} = 0b1111;
      let Inst{7} = 1;
-     let Inst{5-4} = {?,?}; // rotate
+
+      bits<2> rot;
+      let Inst{5-4} = rot{1-0}; // rotate
    }
 }
 
-/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> {
-  def rr     : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr,
-                  opc, "\t$dst, $LHS, $RHS",
-                  [(set rGPR:$dst, (opnode rGPR:$LHS, rGPR:$RHS))]>,
-                  Requires<[HasT2ExtractPack]> {
+multiclass T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode> {
+  def rr     : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
+                  opc, "\t$Rd, $Rn, $Rm",
+                  [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
@@ -744,25 +1087,28 @@ multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot),
-                  IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot",
-                  [(set rGPR:$dst, (opnode rGPR:$LHS,
-                                          (rotr rGPR:$RHS, rot_imm:$rot)))]>,
-                  Requires<[HasT2ExtractPack]> {
+  def rr_rot : T2ThreeReg<(outs rGPR:$Rd),
+                  (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+                  IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
+                  [(set rGPR:$Rd, (opnode rGPR:$Rn,
+                                          (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
      let Inst{15-12} = 0b1111;
      let Inst{7} = 1;
-     let Inst{5-4} = {?,?}; // rotate
+
+     bits<2> rot;
+     let Inst{5-4} = rot{1-0}; // rotate
    }
 }
 
 // DO variant - disassembly only, no pattern
 
-multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> {
-  def rr     : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr,
-                  opc, "\t$dst, $LHS, $RHS", []> {
+multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> {
+  def rr     : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
+                  opc, "\t$Rd, $Rn, $Rm", []> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
@@ -770,14 +1116,16 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot),
-                  IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", []> {
+  def rr_rot : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot),
+                  IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-20} = opcod;
      let Inst{15-12} = 0b1111;
      let Inst{7} = 1;
-     let Inst{5-4} = {?,?}; // rotate
+
+     bits<2> rot;
+     let Inst{5-4} = rot{1-0}; // rotate
    }
 }
 
@@ -789,24 +1137,23 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> {
 //  Miscellaneous Instructions.
 //
 
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<12> label;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = label{11};
+  let Inst{14-12} = label{10-8};
+  let Inst{7-0}   = label{7-0};
+}
+
 // LEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
-let neverHasSideEffects = 1 in {
-let isReMaterializable = 1 in
-def t2LEApcrel : T2XI<(outs rGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
-                      "adr${p}.w\t$dst, #$label", []> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25-24} = 0b10;
-  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
-  let Inst{22} = 0;
-  let Inst{20} = 0;
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{15} = 0;
-}
-} // neverHasSideEffects
-def t2LEApcrelJT : T2XI<(outs rGPR:$dst),
-                        (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi,
-                        "adr${p}.w\t$dst, #${label}_${id}", []> {
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
+              (ins t2adrlabel:$addr, pred:$p),
+              IIC_iALUi, "adr{$p}.w\t$Rd, #$addr", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25-24} = 0b10;
   // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
@@ -814,76 +1161,88 @@ def t2LEApcrelJT : T2XI<(outs rGPR:$dst),
   let Inst{20} = 0;
   let Inst{19-16} = 0b1111; // Rn
   let Inst{15} = 0;
-}
 
+  bits<4> Rd;
+  bits<13> addr;
+  let Inst{11-8} = Rd;
+  let Inst{23}    = addr{12};
+  let Inst{21}    = addr{12};
+  let Inst{26}    = addr{11};
+  let Inst{14-12} = addr{10-8};
+  let Inst{7-0}   = addr{7-0};
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
+                                Size4Bytes, IIC_iALUi, []>;
+def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                                Size4Bytes, IIC_iALUi,
+                                []>;
+
+
+// FIXME: None of these add/sub SP special instructions should be necessary
+// at all for thumb2 since they use the same encodings as the generic
+// add/sub instructions. In thumb1 we need them since they have dedicated
+// encodings. At the least, they should be pseudo instructions.
 // ADD r, sp, {so_imm|i12}
-def t2ADDrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
-                        IIC_iALUi, "add", ".w\t$dst, $sp, $imm", []> {
+let isCodeGenOnly = 1 in {
+def t2ADDrSPi   : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm),
+                        IIC_iALUi, "add", ".w\t$Rd, $Rn, $imm", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 0;
   let Inst{24-21} = 0b1000;
-  let Inst{20} = ?; // The S bit.
-  let Inst{19-16} = 0b1101; // Rn = sp
   let Inst{15} = 0;
 }
-def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
-                       IIC_iALUi, "addw", "\t$dst, $sp, $imm", []> {
+def t2ADDrSPi12 : T2TwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm),
+                       IIC_iALUi, "addw", "\t$Rd, $Rn, $imm", []> {
   let Inst{31-27} = 0b11110;
-  let Inst{25} = 1;
-  let Inst{24-21} = 0b0000;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1101; // Rn = sp
+  let Inst{25-20} = 0b100000;
   let Inst{15} = 0;
 }
 
 // ADD r, sp, so_reg
-def t2ADDrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
-                        IIC_iALUsi, "add", ".w\t$dst, $sp, $rhs", []> {
+def t2ADDrSPs   : T2sTwoRegShiftedReg<
+                        (outs GPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm),
+                        IIC_iALUsi, "add", ".w\t$Rd, $Rn, $ShiftedRm", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b1000;
-  let Inst{20} = ?; // The S bit.
-  let Inst{19-16} = 0b1101; // Rn = sp
   let Inst{15} = 0;
 }
 
 // SUB r, sp, {so_imm|i12}
-def t2SUBrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
-                        IIC_iALUi, "sub", ".w\t$dst, $sp, $imm", []> {
+def t2SUBrSPi   : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm),
+                        IIC_iALUi, "sub", ".w\t$Rd, $Rn, $imm", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 0;
   let Inst{24-21} = 0b1101;
-  let Inst{20} = ?; // The S bit.
-  let Inst{19-16} = 0b1101; // Rn = sp
   let Inst{15} = 0;
 }
-def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
-                       IIC_iALUi, "subw", "\t$dst, $sp, $imm", []> {
+def t2SUBrSPi12 : T2TwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm),
+                       IIC_iALUi, "subw", "\t$Rd, $Rn, $imm", []> {
   let Inst{31-27} = 0b11110;
-  let Inst{25} = 1;
-  let Inst{24-21} = 0b0101;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1101; // Rn = sp
+  let Inst{25-20} = 0b101010;
   let Inst{15} = 0;
 }
 
 // SUB r, sp, so_reg
-def t2SUBrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
+def t2SUBrSPs   : T2sTwoRegImm<(outs GPR:$Rd), (ins GPR:$Rn, t2_so_reg:$imm),
                        IIC_iALUsi,
-                       "sub", "\t$dst, $sp, $rhs", []> {
+                       "sub", "\t$Rd, $Rn, $imm", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b1101;
-  let Inst{20} = ?; // The S bit.
   let Inst{19-16} = 0b1101; // Rn = sp
   let Inst{15} = 0;
 }
+} // end isCodeGenOnly = 1
 
 // Signed and unsigned division on v7-M
-def t2SDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, 
-                 "sdiv", "\t$dst, $a, $b",
-                 [(set rGPR:$dst, (sdiv rGPR:$a, rGPR:$b))]>,
-                 Requires<[HasDivide]> {
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+                 "sdiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-21} = 0b011100;
   let Inst{20} = 0b1;
@@ -891,10 +1250,10 @@ def t2SDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi,
   let Inst{7-4} = 0b1111;
 }
 
-def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, 
-                 "udiv", "\t$dst, $a, $b",
-                 [(set rGPR:$dst, (udiv rGPR:$a, rGPR:$b))]>,
-                 Requires<[HasDivide]> {
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+                 "udiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-21} = 0b011101;
   let Inst{20} = 0b1;
@@ -908,26 +1267,26 @@ def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi,
 
 // Load
 let canFoldAsLoad = 1, isReMaterializable = 1  in
-defm t2LDR   : T2I_ld<0, 0b10, "ldr",  UnOpFrag<(load node:$Src)>>;
+defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si,
+                      UnOpFrag<(load node:$Src)>>;
 
 // Loads with zero extension
-defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>;
-defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", UnOpFrag<(zextloadi8  node:$Src)>>;
+defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      UnOpFrag<(zextloadi16 node:$Src)>>;
+defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      UnOpFrag<(zextloadi8  node:$Src)>>;
 
 // Loads with sign extension
-defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
-defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", UnOpFrag<(sextloadi8  node:$Src)>>;
+defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      UnOpFrag<(sextloadi16 node:$Src)>>;
+defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      UnOpFrag<(sextloadi8  node:$Src)>>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
-def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2),
+def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                         (ins t2addrmode_imm8s4:$addr),
-                        IIC_iLoadi, "ldrd", "\t$dst1, $addr", []>;
-def t2LDRDpci : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2),
-                        (ins i32imm:$addr), IIC_iLoadi,
-                       "ldrd", "\t$dst1, $addr", []> {
-  let Inst{19-16} = 0b1111; // Rn
-}
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", []>;
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 // zextload i1 -> zextload i8
@@ -976,70 +1335,71 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
 //        not via pattern.
 
 // Indexed loads
+
 let mayLoad = 1, neverHasSideEffects = 1 in {
-def t2LDR_PRE  : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$dst, GPR:$base_wb),
+def t2LDR_PRE  : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn),
                             (ins t2addrmode_imm8:$addr),
-                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
-                            "ldr", "\t$dst, $addr!", "$addr.base = $base_wb",
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn",
                             []>;
 
-def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$dst, GPR:$base_wb),
-                            (ins GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
-                          "ldr", "\t$dst, [$base], $offset", "$base = $base_wb",
+def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn),
+                            (ins GPR:$base, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+                          "ldr", "\t$Rt, [$Rn], $addr", "$base = $Rn",
                             []>;
 
-def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb),
+def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
                             (ins t2addrmode_imm8:$addr),
-                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
-                            "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb",
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn",
                             []>;
-def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb),
-                            (ins GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
-                         "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb",
+def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
+                            (ins GPR:$base, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                         "ldrb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
                             []>;
 
-def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb),
+def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
                             (ins t2addrmode_imm8:$addr),
-                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
-                            "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb",
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn",
                             []>;
-def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb),
-                            (ins GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
-                         "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb",
+def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn),
+                            (ins GPR:$base, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                         "ldrh", "\t$Rt, [$Rn], $addr", "$base = $Rn",
                             []>;
 
-def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb),
+def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
                             (ins t2addrmode_imm8:$addr),
-                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
-                            "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb",
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn",
                             []>;
-def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb),
-                            (ins GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
-                        "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb",
+def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
+                            (ins GPR:$base, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                        "ldrsb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
                             []>;
 
-def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb),
+def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
                             (ins t2addrmode_imm8:$addr),
-                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
-                            "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb",
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn",
                             []>;
-def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb),
-                            (ins GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
-                        "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb",
+def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn),
+                            (ins GPR:$base, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                        "ldrsh", "\t$dst, [$Rn], $addr", "$base = $Rn",
                             []>;
-} // mayLoad = 1, neverHasSideEffects = 1 
+} // mayLoad = 1, neverHasSideEffects = 1
 
 // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110) and are
 // for disassembly only.
 // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
-class T2IldT<bit signed, bits<2> type, string opc>
-  : T2Ii8<(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi, opc,
-          "\t$dst, $addr", []> {
+class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
   let Inst{24} = signed;
@@ -1048,74 +1408,83 @@ class T2IldT<bit signed, bits<2> type, string opc>
   let Inst{20} = 1; // load
   let Inst{11} = 1;
   let Inst{10-8} = 0b110; // PUW.
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0}   = addr{7-0};
 }
 
-def t2LDRT   : T2IldT<0, 0b10, "ldrt">;
-def t2LDRBT  : T2IldT<0, 0b00, "ldrbt">;
-def t2LDRHT  : T2IldT<0, 0b01, "ldrht">;
-def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt">;
-def t2LDRSHT : T2IldT<1, 0b01, "ldrsht">;
+def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
+def t2LDRBT  : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>;
+def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
+def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
+def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
 
 // Store
-defm t2STR :T2I_st<0b10,"str", BinOpFrag<(store node:$LHS, node:$RHS)>>;
-defm t2STRB:T2I_st<0b00,"strb",BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
-defm t2STRH:T2I_st<0b01,"strh",BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
 
 // Store doubleword
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
-                       (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr),
-               IIC_iStorer, "strd", "\t$src1, $addr", []>;
+                       (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr),
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>;
 
 // Indexed stores
 def t2STR_PRE  : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
-                         "str", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                         "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
              [(set GPR:$base_wb,
-                   (pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+                   (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
-                          "str", "\t$src, [$base], $offset", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+                          "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
              [(set GPR:$base_wb,
-                  (post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+                  (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRH_PRE  : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
-                        "strh", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                        "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
         [(set GPR:$base_wb,
-              (pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+              (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
-                         "strh", "\t$src, [$base], $offset", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
        [(set GPR:$base_wb,
-             (post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+             (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRB_PRE  : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
-                        "strb", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
+                        "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
          [(set GPR:$base_wb,
-               (pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+               (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
-                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
-                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
-                         "strb", "\t$src, [$base], $offset", "$base = $base_wb",
+                            (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strb", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
         [(set GPR:$base_wb,
-              (post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+              (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 // STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
 // only.
 // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
-class T2IstT<bits<2> type, string opc>
-  : T2Ii8<(outs GPR:$src), (ins t2addrmode_imm8:$addr), IIC_iStorei, opc,
-          "\t$src, $addr", []> {
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
   let Inst{24} = 0; // not signed
@@ -1124,51 +1493,62 @@ class T2IstT<bits<2> type, string opc>
   let Inst{20} = 0; // store
   let Inst{11} = 1;
   let Inst{10-8} = 0b110; // PUW
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0}   = addr{7-0};
 }
 
-def t2STRT   : T2IstT<0b10, "strt">;
-def t2STRBT  : T2IstT<0b00, "strbt">;
-def t2STRHT  : T2IstT<0b01, "strht">;
+def t2STRT   : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT  : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 
 // ldrd / strd pre / post variants
 // For disassembly only.
 
-def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs GPR:$dst1, GPR:$dst2),
-                 (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary,
-                 "ldrd", "\t$dst1, $dst2, [$base, $imm]!", []>;
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2),
+                 (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
 
-def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$dst1, GPR:$dst2),
-                 (ins GPR:$base, t2am_imm8s4_offset:$imm), NoItinerary,
-                 "ldrd", "\t$dst1, $dst2, [$base], $imm", []>;
+def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2),
+                 (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>;
 
 def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs),
-                 (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm),
-                 NoItinerary, "strd", "\t$src1, $src2, [$base, $imm]!", []>;
+                 (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
 
 def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs),
-                 (ins GPR:$src1, GPR:$src2, GPR:$base, t2am_imm8s4_offset:$imm),
-                 NoItinerary, "strd", "\t$src1, $src2, [$base], $imm", []>;
+                 (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>;
 
 // T2Ipl (Preload Data/Instruction) signals the memory system of possible future
 // data/instruction access.  These are for disassembly only.
-//
-// A8.6.117, A8.6.118.  Different instructions are generated for #0 and #-0.
-// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc.
-multiclass T2Ipl<bit instr, bit write, string opc> {
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2),  (prefetch 2) -> (preload 1).
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
 
-  def i12 : T2I<(outs), (ins GPR:$base, i32imm:$imm), IIC_iLoadi, opc,
-                "\t[$base, $imm]", []> {
+  def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
+                "\t$addr",
+              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
-    let Inst{23} = 1; // U = 1
     let Inst{22} = 0;
     let Inst{21} = write;
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
+
+    bits<17> addr;
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm12
   }
 
-  def i8 : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc,
-                "\t[$base, $imm]", []> {
+  def i8 : T2Ii8<(outs), (ins t2addrmode_imm8:$addr), IIC_Preload, opc,
+                "\t$addr",
+               [(ARMPreload t2addrmode_imm8:$addr, (i32 write), (i32 instr))]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
     let Inst{23} = 0; // U = 0
@@ -1177,22 +1557,15 @@ multiclass T2Ipl<bit instr, bit write, string opc> {
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
     let Inst{11-8} = 0b1100;
-  }
 
-  def pci : T2I<(outs), (ins GPR:$base, neg_zero:$imm), IIC_iLoadi, opc,
-                "\t[pc, $imm]", []> {
-    let Inst{31-25} = 0b1111100;
-    let Inst{24} = instr;
-    let Inst{23} = ?; // add = (U == 1)
-    let Inst{22} = 0;
-    let Inst{21} = write;
-    let Inst{20} = 1;
-    let Inst{19-16} = 0b1111; // Rn = 0b1111
-    let Inst{15-12} = 0b1111;
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{7-0}   = addr{7-0};  // imm8
   }
 
-  def r   : T2I<(outs), (ins GPR:$base, GPR:$a), IIC_iLoadi, opc,
-                "\t[$base, $a]", []> {
+  def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
+               "\t$addr",
+             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
     let Inst{23} = 0; // add = TRUE for T1
@@ -1201,133 +1574,174 @@ multiclass T2Ipl<bit instr, bit write, string opc> {
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
     let Inst{11-6} = 0000000;
-    let Inst{5-4} = 0b00; // no shift is applied
-  }
 
-  def s   : T2I<(outs), (ins GPR:$base, GPR:$a, i32imm:$shamt), IIC_iLoadi, opc,
-                "\t[$base, $a, lsl $shamt]", []> {
-    let Inst{31-25} = 0b1111100;
-    let Inst{24} = instr;
-    let Inst{23} = 0; // add = TRUE for T1
-    let Inst{22} = 0;
-    let Inst{21} = write;
-    let Inst{20} = 1;
-    let Inst{15-12} = 0b1111;
-    let Inst{11-6} = 0000000;
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm2
   }
 }
 
-defm t2PLD  : T2Ipl<0, 0, "pld">;
-defm t2PLDW : T2Ipl<0, 1, "pldw">;
-defm t2PLI  : T2Ipl<1, 0, "pli">;
+defm t2PLD  : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI  : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
 //
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$dsts, variable_ops), IIC_iLoadm,
-                 "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b00;
-  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
-  let Inst{22} = 0;
-  let Inst{21} = 0; // The W bit.
-  let Inst{20} = 1; // Load
-}
+multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "ia${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
 
-def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$dsts, variable_ops), IIC_iLoadm,
-                      "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts",
-                      "$addr.addr = $wb", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b00;
-  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
-  let Inst{22} = 0;
-  let Inst{21} = 1; // The W bit.
-  let Inst{20} = 1; // Load
-}
-} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "ia${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$srcs, variable_ops), IIC_iStorem,
-                 "stm${addr:submode}${p}${addr:wide}\t$addr, $srcs", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b00;
-  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
-  let Inst{22} = 0;
-  let Inst{21} = 0; // The W bit.
-  let Inst{20} = 0; // Store
-}
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
 
-def t2STM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$srcs, variable_ops),
-                      IIC_iStorem,
-                      "stm${addr:submode}${p}${addr:wide}\t$addr!, $srcs",
-                      "$addr.addr = $wb", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b00;
-  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
-  let Inst{22} = 0;
-  let Inst{21} = 1; // The W bit.
-  let Inst{20} = 0; // Store
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}.w\t$Rn, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
 }
-} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm t2LDM : thumb2_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+
+} // neverHasSideEffects
+
 
 //===----------------------------------------------------------------------===//
 //  Move Instructions.
 //
 
 let neverHasSideEffects = 1 in
-def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
-                   "mov", ".w\t$dst, $src", []> {
+def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                   "mov", ".w\t$Rd, $Rm", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
-  let Inst{20} = ?; // The S bit.
   let Inst{19-16} = 0b1111; // Rn
   let Inst{14-12} = 0b000;
   let Inst{7-4} = 0b0000;
 }
 
 // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in
-def t2MOVi : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
-                   "mov", ".w\t$dst, $src",
-                   [(set rGPR:$dst, t2_so_imm:$src)]> {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    AddedComplexity = 1 in
+def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
+                   "mov", ".w\t$Rd, $imm",
+                   [(set rGPR:$Rd, t2_so_imm:$imm)]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 0;
   let Inst{24-21} = 0b0010;
-  let Inst{20} = ?; // The S bit.
   let Inst{19-16} = 0b1111; // Rn
   let Inst{15} = 0;
 }
 
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def t2MOVi16 : T2I<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
-                   "movw", "\t$dst, $src",
-                   [(set rGPR:$dst, imm0_65535:$src)]> {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi,
+                   "movw", "\t$Rd, $imm",
+                   [(set rGPR:$Rd, imm0_65535:$imm)]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-21} = 0b0010;
   let Inst{20} = 0; // The S bit.
   let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
 }
 
-let Constraints = "$src = $dst" in
-def t2MOVTi16 : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$imm), IIC_iMOVi,
-                    "movt", "\t$dst, $imm",
-                    [(set rGPR:$dst,
+def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def t2MOVTi16 : T2I<(outs rGPR:$Rd),
+                    (ins rGPR:$src, i32imm_hilo16:$imm), IIC_iMOVi,
+                    "movt", "\t$Rd, $imm",
+                    [(set rGPR:$Rd,
                           (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-21} = 0b0110;
   let Inst{20} = 0; // The S bit.
   let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
 }
 
+def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+} // Constraints
+
 def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
 
 //===----------------------------------------------------------------------===//
@@ -1336,28 +1750,28 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
 
 // Sign extenders
 
-defm t2SXTB  : T2I_unary_rrot<0b100, "sxtb",
+defm t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
                               UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm t2SXTH  : T2I_unary_rrot<0b000, "sxth",
+defm t2SXTH  : T2I_ext_rrot<0b000, "sxth",
                               UnOpFrag<(sext_inreg node:$Src, i16)>>;
-defm t2SXTB16 : T2I_unary_rrot_sxtb16<0b010, "sxtb16">;
+defm t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
 
-defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab",
+defm t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
                         BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm t2SXTAH : T2I_bin_rrot<0b000, "sxtah",
+defm t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
                         BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-defm t2SXTAB16 : T2I_bin_rrot_DO<0b010, "sxtab16">;
+defm t2SXTAB16 : T2I_exta_rrot_DO<0b010, "sxtab16">;
 
 // TODO: SXT(A){B|H}16 - done for disassembly only
 
 // Zero extenders
 
 let AddedComplexity = 16 in {
-defm t2UXTB   : T2I_unary_rrot<0b101, "uxtb",
+defm t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
                                UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm t2UXTH   : T2I_unary_rrot<0b001, "uxth",
+defm t2UXTH   : T2I_ext_rrot<0b001, "uxth",
                                UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16",
+defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
                                UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
 
 // FIXME: This pattern incorrectly assumes the shl operator is a rotate.
@@ -1365,15 +1779,17 @@ defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16",
 //        instead so we can include a check for masking back in the upper
 //        eight bits of the source into the lower eight bits of the result.
 //def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
-//            (t2UXTB16r_rot rGPR:$Src, 24)>, Requires<[HasT2ExtractPack]>;
+//            (t2UXTB16r_rot rGPR:$Src, 24)>,
+//          Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
-            (t2UXTB16r_rot rGPR:$Src, 8)>, Requires<[HasT2ExtractPack]>;
+            (t2UXTB16r_rot rGPR:$Src, 8)>,
+        Requires<[HasT2ExtractPack, IsThumb2]>;
 
-defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab",
+defm t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
                            BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm t2UXTAH : T2I_bin_rrot<0b001, "uxtah",
+defm t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
                            BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-defm t2UXTAB16 : T2I_bin_rrot_DO<0b011, "uxtab16">;
+defm t2UXTAB16 : T2I_exta_rrot_DO<0b011, "uxtab16">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1387,8 +1803,10 @@ defm t2SUB  : T2I_bin_ii12rs<0b101, "sub",
 
 // ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
 defm t2ADDS : T2I_bin_s_irs <0b1000, "add",
+                             IIC_iALUi, IIC_iALUr, IIC_iALUsi,
                              BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
 defm t2SUBS : T2I_bin_s_irs <0b1101, "sub",
+                             IIC_iALUi, IIC_iALUr, IIC_iALUsi,
                              BinOpFrag<(subc node:$LHS, node:$RHS)>>;
 
 defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc",
@@ -1436,8 +1854,8 @@ def : T2Pat<(adde       rGPR:$src, t2_so_imm_not:$imm),
 
 // Select Bytes -- for disassembly only
 
-def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel",
-                "\t$dst, $a, $b", []> {
+def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-24} = 0b010;
   let Inst{23} = 0b1;
@@ -1450,28 +1868,41 @@ def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel",
 // A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
 // And Miscellaneous operations -- for disassembly only
 class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
-              list<dag> pat = [/* For disassembly only; pattern left blank */]>
-  : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), NoItinerary, opc,
-        "\t$dst, $a, $b", pat> {
+              list<dag> pat = [/* For disassembly only; pattern left blank */],
+              dag iops = (ins rGPR:$Rn, rGPR:$Rm),
+              string asm = "\t$Rd, $Rn, $Rm">
+  : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0101;
   let Inst{22-20} = op22_20;
   let Inst{15-12} = 0b1111;
   let Inst{7-4} = op7_4;
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
 }
 
 // Saturating add/subtract -- for disassembly only
 
 def t2QADD    : T2I_pam<0b000, 0b1000, "qadd",
-                        [(set rGPR:$dst, (int_arm_qadd rGPR:$a, rGPR:$b))]>;
+                        [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
 def t2QADD16  : T2I_pam<0b001, 0b0001, "qadd16">;
 def t2QADD8   : T2I_pam<0b000, 0b0001, "qadd8">;
 def t2QASX    : T2I_pam<0b010, 0b0001, "qasx">;
-def t2QDADD   : T2I_pam<0b000, 0b1001, "qdadd">;
-def t2QDSUB   : T2I_pam<0b000, 0b1011, "qdsub">;
+def t2QDADD   : T2I_pam<0b000, 0b1001, "qdadd", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QDSUB   : T2I_pam<0b000, 0b1011, "qdsub", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
 def t2QSAX    : T2I_pam<0b110, 0b0001, "qsax">;
 def t2QSUB    : T2I_pam<0b000, 0b1010, "qsub",
-                        [(set rGPR:$dst, (int_arm_qsub rGPR:$a, rGPR:$b))]>;
+                        [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
 def t2QSUB16  : T2I_pam<0b101, 0b0001, "qsub16">;
 def t2QSUB8   : T2I_pam<0b100, 0b0001, "qsub8">;
 def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
@@ -1511,21 +1942,61 @@ def t2UHSAX   : T2I_pam<0b110, 0b0110, "uhsax">;
 def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
 def t2UHSUB8  : T2I_pam<0b100, 0b0110, "uhsub8">;
 
+// Helper class for disassembly only
+// A6.3.16 & A6.3.17
+// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
+class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2FourReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
 // Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
 
-def t2USAD8   : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst),
-                                           (ins rGPR:$a, rGPR:$b),
-                        NoItinerary, "usad8", "\t$dst, $a, $b", []> {
+def t2USAD8   : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                                           (ins rGPR:$Rn, rGPR:$Rm),
+                        NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []> {
   let Inst{15-12} = 0b1111;
 }
-def t2USADA8  : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst),
-                       (ins rGPR:$a, rGPR:$b, rGPR:$acc), NoItinerary, "usada8",
-                        "\t$dst, $a, $b, $acc", []>;
+def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                       (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
+                        "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>;
 
 // Signed/Unsigned saturate -- for disassembly only
 
-def t2SSAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh),
-                NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh",
+class T2SatI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> sat_imm;
+  bits<7> sh;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{4-0}   = sat_imm{4-0};
+  let Inst{21}    = sh{6};
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+def t2SSAT: T2SatI<
+                (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
+                NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
@@ -1533,8 +2004,9 @@ def t2SSAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh),
   let Inst{15} = 0;
 }
 
-def t2SSAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary,
-                   "ssat16", "\t$dst, $bit_pos, $a",
+def t2SSAT16: T2SatI<
+                   (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary,
+                   "ssat16", "\t$Rd, $sat_imm, $Rn",
                    [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
@@ -1545,8 +2017,9 @@ def t2SSAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary,
   let Inst{7-6} = 0b00;    // imm2 = '00'
 }
 
-def t2USAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh),
-                NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh",
+def t2USAT: T2SatI<
+                (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
+                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1110;
@@ -1554,8 +2027,9 @@ def t2USAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh),
   let Inst{15} = 0;
 }
 
-def t2USAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary,
-                   "usat16", "\t$dst, $bit_pos, $a",
+def t2USAT16: T2SatI<
+                    (outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary,
+                   "usat16", "\t$dst, $sat_imm, $Rn",
                    [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1110;
@@ -1579,23 +2053,23 @@ defm t2ASR  : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra  node:$LHS, node:$RHS)>>;
 defm t2ROR  : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
 
 let Uses = [CPSR] in {
-def t2MOVrx : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi,
-                   "rrx", "\t$dst, $src",
-                   [(set rGPR:$dst, (ARMrrx rGPR:$src))]> {
+def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                   "rrx", "\t$Rd, $Rm",
+                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
-  let Inst{20} = ?; // The S bit.
   let Inst{19-16} = 0b1111; // Rn
   let Inst{14-12} = 0b000;
   let Inst{7-4} = 0b0011;
 }
 }
 
-let Defs = [CPSR] in {
-def t2MOVsrl_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi,
-                        "lsrs", ".w\t$dst, $src, #1",
-                        [(set rGPR:$dst, (ARMsrl_flag rGPR:$src))]> {
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def t2MOVsrl_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "lsrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -1606,9 +2080,10 @@ def t2MOVsrl_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi,
   let Inst{14-12} = 0b000;
   let Inst{7-6} = 0b01;
 }
-def t2MOVsra_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi,
-                        "asrs", ".w\t$dst, $src, #1",
-                        [(set rGPR:$dst, (ARMsra_flag rGPR:$src))]> {
+def t2MOVsra_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "asrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -1626,39 +2101,67 @@ def t2MOVsra_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi,
 //
 
 defm t2AND  : T2I_bin_w_irs<0b0000, "and",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
                             BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
 defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
                             BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
 defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
                             BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
 
 defm t2BIC  : T2I_bin_w_irs<0b0001, "bic",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
                             BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
 
-defm t2ANDS : T2I_bin_s_irs<0b0000, "and",
-                            BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>;
+class T2BitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<5> msb;
+  bits<5> lsb;
+
+  let Inst{11-8}  = Rd;
+  let Inst{4-0}   = msb{4-0};
+  let Inst{14-12} = lsb{4-2};
+  let Inst{7-6}   = lsb{1-0};
+}
+
+class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2BitFI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
 
-let Constraints = "$src = $dst" in
-def t2BFC : T2I<(outs rGPR:$dst), (ins rGPR:$src, bf_inv_mask_imm:$imm),
-                IIC_iUNAsi, "bfc", "\t$dst, $imm",
-                [(set rGPR:$dst, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{19-16} = Rn;
+}
+
+let Constraints = "$src = $Rd" in
+def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$Rd, $imm",
+                [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-20} = 0b10110;
   let Inst{19-16} = 0b1111; // Rn
   let Inst{15} = 0;
+
+  bits<10> imm;
+  let msb{4-0} = imm{9-5};
+  let lsb{4-0} = imm{4-0};
 }
 
-def t2SBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width),
-                 IIC_iALUi, "sbfx", "\t$dst, $src, $lsb, $width", []> {
+def t2SBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+                 IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-20} = 0b10100;
   let Inst{15} = 0;
 }
 
-def t2UBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width),
-                 IIC_iALUi, "ubfx", "\t$dst, $src, $lsb, $width", []> {
+def t2UBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+                 IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-20} = 0b11100;
@@ -1666,24 +2169,50 @@ def t2UBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width),
 }
 
 // A8.6.18  BFI - Bitfield insert (Encoding T1)
-let Constraints = "$src = $dst" in
-def t2BFI : T2I<(outs rGPR:$dst),
-                (ins rGPR:$src, rGPR:$val, bf_inv_mask_imm:$imm),
-                IIC_iALUi, "bfi", "\t$dst, $val, $imm",
-                [(set rGPR:$dst, (ARMbfi rGPR:$src, rGPR:$val,
-                                 bf_inv_mask_imm:$imm))]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25} = 1;
-  let Inst{24-20} = 0b10110;
-  let Inst{15} = 0;
+let Constraints = "$src = $Rd" in {
+  def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
+                  (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
+                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
+                                   bf_inv_mask_imm:$imm))]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{25} = 1;
+    let Inst{24-20} = 0b10110;
+    let Inst{15} = 0;
+
+    bits<10> imm;
+    let msb{4-0} = imm{9-5};
+    let lsb{4-0} = imm{4-0};
+  }
+
+  // GNU as only supports this form of bfi (w/ 4 arguments)
+  let isAsmParserOnly = 1 in
+  def t2BFI4p : T2TwoRegBitFI<(outs rGPR:$Rd),
+                  (ins rGPR:$src, rGPR:$Rn, lsb_pos_imm:$lsbit,
+                       width_imm:$width),
+                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width",
+                  []> {
+    let Inst{31-27} = 0b11110;
+    let Inst{25} = 1;
+    let Inst{24-20} = 0b10110;
+    let Inst{15} = 0;
+
+    bits<5> lsbit;
+    bits<5> width;
+    let msb{4-0} = width; // Custom encoder => lsb+width-1
+    let lsb{4-0} = lsbit;
+  }
 }
 
-defm t2ORN  : T2I_bin_irs<0b0011, "orn", BinOpFrag<(or  node:$LHS,
-                          (not node:$RHS))>, 0, "">;
+defm t2ORN  : T2I_bin_irs<0b0011, "orn",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                          BinOpFrag<(or  node:$LHS, (not node:$RHS))>, 0, "">;
 
 // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
 let AddedComplexity = 1 in
-defm t2MVN  : T2I_un_irs <0b0011, "mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
+defm t2MVN  : T2I_un_irs <0b0011, "mvn",
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+                          UnOpFrag<(not node:$Src)>, 1, 1>;
 
 
 let AddedComplexity = 1 in
@@ -1702,9 +2231,9 @@ def : T2Pat<(t2_so_imm_not:$src),
 //  Multiply Instructions.
 //
 let isCommutable = 1 in
-def t2MUL: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-                "mul", "\t$dst, $a, $b",
-                [(set rGPR:$dst, (mul rGPR:$a, rGPR:$b))]> {
+def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                "mul", "\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b000;
@@ -1712,83 +2241,63 @@ def t2MUL: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
   let Inst{7-4} = 0b0000; // Multiply
 }
 
-def t2MLA: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-		"mla", "\t$dst, $a, $b, $c",
-		[(set rGPR:$dst, (add (mul rGPR:$a, rGPR:$b), rGPR:$c))]> {
+def t2MLA: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b000;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0000; // Multiply
 }
 
-def t2MLS: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-		"mls", "\t$dst, $a, $b, $c",
-                [(set rGPR:$dst, (sub rGPR:$c, (mul rGPR:$a, rGPR:$b)))]> {
+def t2MLS: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b000;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0001; // Multiply and Subtract
 }
 
 // Extra precision multiplies with low / high results
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
-def t2SMULL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
-                  (ins rGPR:$a, rGPR:$b), IIC_iMUL64,
-                   "smull", "\t$ldst, $hdst, $a, $b", []> {
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0111;
-  let Inst{22-20} = 0b000;
-  let Inst{7-4} = 0b0000;
-}
-
-def t2UMULL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
-                  (ins rGPR:$a, rGPR:$b), IIC_iMUL64,
-                   "umull", "\t$ldst, $hdst, $a, $b", []> {
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0111;
-  let Inst{22-20} = 0b010;
-  let Inst{7-4} = 0b0000;
-}
+def t2SMULL : T2MulLong<0b000, 0b0000,
+                  (outs rGPR:$Rd, rGPR:$Ra),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "smull", "\t$Rd, $Ra, $Rn, $Rm", []>;
+
+def t2UMULL : T2MulLong<0b010, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
 } // isCommutable
 
 // Multiply + accumulate
-def t2SMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
-                  (ins rGPR:$a, rGPR:$b), IIC_iMAC64,
-                  "smlal", "\t$ldst, $hdst, $a, $b", []>{
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0111;
-  let Inst{22-20} = 0b100;
-  let Inst{7-4} = 0b0000;
-}
-
-def t2UMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
-                  (ins rGPR:$a, rGPR:$b), IIC_iMAC64,
-                  "umlal", "\t$ldst, $hdst, $a, $b", []>{
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0111;
-  let Inst{22-20} = 0b110;
-  let Inst{7-4} = 0b0000;
-}
-
-def t2UMAAL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
-                  (ins rGPR:$a, rGPR:$b), IIC_iMAC64,
-                  "umaal", "\t$ldst, $hdst, $a, $b", []>{
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0111;
-  let Inst{22-20} = 0b110;
-  let Inst{7-4} = 0b0110;
-}
+def t2SMLAL : T2MulLong<0b100, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+                  "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+
+def t2UMLAL : T2MulLong<0b110, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+                  "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+
+def t2UMAAL : T2MulLong<0b110, 0b0110,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+                  "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
 } // neverHasSideEffects
 
 // Rounding variants of the below included for disassembly only
 
 // Most significant word multiply
-def t2SMMUL : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-                  "smmul", "\t$dst, $a, $b",
-                  [(set rGPR:$dst, (mulhs rGPR:$a, rGPR:$b))]> {
+def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmul", "\t$Rd, $Rn, $Rm",
+                  [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b101;
@@ -1796,8 +2305,8 @@ def t2SMMUL : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
   let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
 }
 
-def t2SMMULR : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-                  "smmulr", "\t$dst, $a, $b", []> {
+def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmulr", "\t$Rd, $Rn, $Rm", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b101;
@@ -1805,49 +2314,49 @@ def t2SMMULR : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
   let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
 }
 
-def t2SMMLA : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-                  "smmla", "\t$dst, $a, $b, $c",
-                  [(set rGPR:$dst, (add (mulhs rGPR:$a, rGPR:$b), rGPR:$c))]> {
+def t2SMMLA : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b101;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
 }
 
-def t2SMMLAR: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-                  "smmlar", "\t$dst, $a, $b, $c", []> {
+def t2SMMLAR: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                  "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b101;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
 }
 
-def t2SMMLS: T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-                   "smmls", "\t$dst, $a, $b, $c",
-                   [(set rGPR:$dst, (sub rGPR:$c, (mulhs rGPR:$a, rGPR:$b)))]> {
+def t2SMMLS: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b110;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
 }
 
-def t2SMMLSR:T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
-                   "smmlsr", "\t$dst, $a, $b, $c", []> {
+def t2SMMLSR:T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b110;
-  let Inst{15-12} = {?, ?, ?, ?}; // Ra
   let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
 }
 
 multiclass T2I_smul<string opc, PatFrag opnode> {
-  def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-              !strconcat(opc, "bb"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16),
-                                      (sext_inreg rGPR:$b, i16)))]> {
+  def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sext_inreg rGPR:$Rm, i16)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
@@ -1856,10 +2365,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
     let Inst{5-4} = 0b00;
   }
 
-  def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-              !strconcat(opc, "bt"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16),
-                                      (sra rGPR:$b, (i32 16))))]> {
+  def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sra rGPR:$Rm, (i32 16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
@@ -1868,10 +2377,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
     let Inst{5-4} = 0b01;
   }
 
-  def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-              !strconcat(opc, "tb"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)),
-                                      (sext_inreg rGPR:$b, i16)))]> {
+  def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sext_inreg rGPR:$Rm, i16)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
@@ -1880,10 +2389,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
     let Inst{5-4} = 0b10;
   }
 
-  def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
-              !strconcat(opc, "tt"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)),
-                                      (sra rGPR:$b, (i32 16))))]> {
+  def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sra rGPR:$Rm, (i32 16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
@@ -1892,10 +2401,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
     let Inst{5-4} = 0b11;
   }
 
-  def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16,
-              !strconcat(opc, "wb"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (sra (opnode rGPR:$a,
-                                    (sext_inreg rGPR:$b, i16)), (i32 16)))]> {
+  def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b011;
@@ -1904,10 +2413,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
     let Inst{5-4} = 0b00;
   }
 
-  def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16,
-              !strconcat(opc, "wt"), "\t$dst, $a, $b",
-              [(set rGPR:$dst, (sra (opnode rGPR:$a,
-                                    (sra rGPR:$b, (i32 16))), (i32 16)))]> {
+  def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sra rGPR:$Rm, (i32 16))), (i32 16)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b011;
@@ -1919,75 +2428,75 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
 
 
 multiclass T2I_smla<string opc, PatFrag opnode> {
-  def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-              !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc",
-              [(set rGPR:$dst, (add rGPR:$acc,
-                               (opnode (sext_inreg rGPR:$a, i16),
-                                       (sext_inreg rGPR:$b, i16))))]> {
+  def BB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra,
+                               (opnode (sext_inreg rGPR:$Rn, i16),
+                                       (sext_inreg rGPR:$Rm, i16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b00;
   }
 
-  def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-             !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc",
-             [(set rGPR:$dst, (add rGPR:$acc, (opnode (sext_inreg rGPR:$a, i16),
-                                                  (sra rGPR:$b, (i32 16)))))]> {
+  def BT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+             !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
+                                                 (sra rGPR:$Rm, (i32 16)))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b01;
   }
 
-  def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-              !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc",
-              [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)),
-                                                (sext_inreg rGPR:$b, i16))))]> {
+  def TB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                               (sext_inreg rGPR:$Rm, i16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b10;
   }
 
-  def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-              !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc",
-             [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)),
-                                                  (sra rGPR:$b, (i32 16)))))]> {
+  def TT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                                 (sra rGPR:$Rm, (i32 16)))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b001;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b11;
   }
 
-  def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-              !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc",
-              [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a,
-                                     (sext_inreg rGPR:$b, i16)), (i32 16))))]> {
+  def WB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b011;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b00;
   }
 
-  def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16,
-              !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc",
-              [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a,
-                                       (sra rGPR:$b, (i32 16))), (i32 16))))]> {
+  def WT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                      (sra rGPR:$Rm, (i32 16))), (i32 16))))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = 0b011;
-    let Inst{15-12} = {?, ?, ?, ?}; // Ra
     let Inst{7-6} = 0b00;
     let Inst{5-4} = 0b01;
   }
@@ -1997,62 +2506,68 @@ defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
 
 // Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only
-def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs rGPR:$ldst,rGPR:$hdst),
-         (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b",
+def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
            [/* For disassembly only; pattern left blank */]>;
-def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs rGPR:$ldst,rGPR:$hdst),
-         (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b",
+def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm",
            [/* For disassembly only; pattern left blank */]>;
-def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs rGPR:$ldst,rGPR:$hdst),
-         (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b",
+def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm",
            [/* For disassembly only; pattern left blank */]>;
-def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs rGPR:$ldst,rGPR:$hdst),
-         (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b",
+def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm",
            [/* For disassembly only; pattern left blank */]>;
 
 // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
 // These are for disassembly only.
 
-def t2SMUAD: T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b),
-                     IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> {
+def t2SMUAD: T2ThreeReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []> {
   let Inst{15-12} = 0b1111;
 }
-def t2SMUADX:T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b),
-                     IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> {
+def t2SMUADX:T2ThreeReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []> {
   let Inst{15-12} = 0b1111;
 }
-def t2SMUSD: T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b),
-                     IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> {
+def t2SMUSD: T2ThreeReg_mac<
+            0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []> {
   let Inst{15-12} = 0b1111;
 }
-def t2SMUSDX:T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b),
-                     IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> {
+def t2SMUSDX:T2ThreeReg_mac<
+            0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []> {
   let Inst{15-12} = 0b1111;
 }
-def t2SMLAD   : T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst),
-                        (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlad",
-                        "\t$dst, $a, $b, $acc", []>;
-def t2SMLADX  : T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst),
-                        (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smladx",
-                        "\t$dst, $a, $b, $acc", []>;
-def t2SMLSD   : T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst),
-                        (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsd",
-                        "\t$dst, $a, $b, $acc", []>;
-def t2SMLSDX  : T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst),
-                        (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsdx",
-                        "\t$dst, $a, $b, $acc", []>;
-def t2SMLALD  : T2I_mac<1, 0b100, 0b1100, (outs rGPR:$ldst,rGPR:$hdst),
-                        (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlald",
-                        "\t$ldst, $hdst, $a, $b", []>;
-def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs rGPR:$ldst,rGPR:$hdst),
-                        (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaldx",
-                        "\t$ldst, $hdst, $a, $b", []>;
-def t2SMLSLD  : T2I_mac<1, 0b101, 0b1100, (outs rGPR:$ldst,rGPR:$hdst),
-                        (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsld",
-                        "\t$ldst, $hdst, $a, $b", []>;
-def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs rGPR:$ldst,rGPR:$hdst),
-                        (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsldx",
-                        "\t$ldst, $hdst, $a, $b", []>;
+def t2SMLAD   : T2ThreeReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
+            "\t$Rd, $Rn, $Rm, $Ra", []>;
+def t2SMLADX  : T2FourReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>;
+def t2SMLSD   : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd",
+            "\t$Rd, $Rn, $Rm, $Ra", []>;
+def t2SMLSDX  : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>;
+def t2SMLALD  : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm, rGPR:$Rn), IIC_iMAC64, "smlald",
+                        "\t$Ra, $Rd, $Rm, $Rn", []>;
+def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlaldx",
+                        "\t$Ra, $Rd, $Rm, $Rn", []>;
+def t2SMLSLD  : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsld",
+                        "\t$Ra, $Rd, $Rm, $Rn", []>;
+def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
+                        "\t$Ra, $Rd, $Rm, $Rn", []>;
 
 //===----------------------------------------------------------------------===//
 //  Misc. Arithmetic Instructions.
@@ -2060,99 +2575,117 @@ def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs rGPR:$ldst,rGPR:$hdst),
 
 class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
       InstrItinClass itin, string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
   let Inst{31-27} = 0b11111;
   let Inst{26-22} = 0b01010;
   let Inst{21-20} = op1;
   let Inst{15-12} = 0b1111;
   let Inst{7-6} = 0b10;
   let Inst{5-4} = op2;
+  let Rn{3-0} = Rm;
 }
 
-def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                    "clz", "\t$dst, $src", [(set rGPR:$dst, (ctlz rGPR:$src))]>;
+def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>;
 
-def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                      "rbit", "\t$dst, $src",
-                      [(set rGPR:$dst, (ARMrbit rGPR:$src))]>;
+def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                      "rbit", "\t$Rd, $Rm",
+                      [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>;
 
-def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                 "rev", ".w\t$dst, $src", [(set rGPR:$dst, (bswap rGPR:$src))]>;
+def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>;
 
-def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                       "rev16", ".w\t$dst, $src",
-                [(set rGPR:$dst,
-                    (or (and (srl rGPR:$src, (i32 8)), 0xFF),
-                        (or (and (shl rGPR:$src, (i32 8)), 0xFF00),
-                            (or (and (srl rGPR:$src, (i32 8)), 0xFF0000),
-                               (and (shl rGPR:$src, (i32 8)), 0xFF000000)))))]>;
+def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "rev16", ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd,
+                    (or (and (srl rGPR:$Rm, (i32 8)), 0xFF),
+                        (or (and (shl rGPR:$Rm, (i32 8)), 0xFF00),
+                            (or (and (srl rGPR:$Rm, (i32 8)), 0xFF0000),
+                               (and (shl rGPR:$Rm, (i32 8)), 0xFF000000)))))]>;
 
-def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
-                       "revsh", ".w\t$dst, $src",
-                 [(set rGPR:$dst,
+def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "revsh", ".w\t$Rd, $Rm",
+                 [(set rGPR:$Rd,
                     (sext_inreg
-                      (or (srl (and rGPR:$src, 0xFF00), (i32 8)),
-                          (shl rGPR:$src, (i32 8))), i16))]>;
-
-def t2PKHBT : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh),
-                  IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh",
-                  [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF),
-                                      (and (shl rGPR:$src2, lsl_amt:$sh),
+                      (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)),
+                          (shl rGPR:$Rm, (i32 8))), i16))]>;
+
+def t2PKHBT : T2ThreeReg<
+            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+                  IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
+                                      (and (shl rGPR:$Rm, lsl_amt:$sh),
                                            0xFFFF0000)))]>,
-                  Requires<[HasT2ExtractPack]> {
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-20} = 0b01100;
   let Inst{5} = 0; // BT form
   let Inst{4} = 0;
+
+  bits<8> sh;
+  let Inst{14-12} = sh{7-5};
+  let Inst{7-6}   = sh{4-3};
 }
 
 // Alternate cases for PKHBT where identities eliminate some nodes.
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
             (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
-            Requires<[HasT2ExtractPack]>;
+            Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
             (t2PKHBT rGPR:$src1, rGPR:$src2, (lsl_shift_imm imm16_31:$sh))>,
-            Requires<[HasT2ExtractPack]>;
+            Requires<[HasT2ExtractPack, IsThumb2]>;
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
 // will match the pattern below.
-def t2PKHTB : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh),
-                  IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh",
-                  [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF0000),
-                                       (and (sra rGPR:$src2, asr_amt:$sh),
+def t2PKHTB : T2ThreeReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+                  IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
+                                       (and (sra rGPR:$Rm, asr_amt:$sh),
                                             0xFFFF)))]>,
-                  Requires<[HasT2ExtractPack]> {
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-20} = 0b01100;
   let Inst{5} = 1; // TB form
   let Inst{4} = 0;
+
+  bits<8> sh;
+  let Inst{14-12} = sh{7-5};
+  let Inst{7-6}   = sh{4-3};
 }
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm16_31:$sh))>,
-            Requires<[HasT2ExtractPack]>;
+            Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
                 (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm1_15:$sh))>,
-            Requires<[HasT2ExtractPack]>;
+            Requires<[HasT2ExtractPack, IsThumb2]>;
 
 //===----------------------------------------------------------------------===//
 //  Comparison Instructions...
 //
 defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
+                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
                           BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
-defm t2CMPz : T2I_cmp_irs<0b1101, "cmp",
-                          BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
+
+def : T2Pat<(ARMcmpZ  GPR:$lhs, t2_so_imm:$imm),
+            (t2CMPri  GPR:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ  GPR:$lhs, rGPR:$rhs),
+            (t2CMPrr  GPR:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPR:$lhs, t2_so_reg:$rhs),
+            (t2CMPrs  GPR:$lhs, t2_so_reg:$rhs)>;
 
 //FIXME: Disable CMN, as CCodes are backwards from compare expectations
 //       Compare-to-zero still works out, just not the relationals
 //defm t2CMN  : T2I_cmp_irs<0b1000, "cmn",
 //                          BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
 defm t2CMNz : T2I_cmp_irs<0b1000, "cmn",
+                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
                           BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
 
 //def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
@@ -2162,18 +2695,21 @@ def : T2Pat<(ARMcmpZ  GPR:$src, t2_so_imm_neg:$imm),
             (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>;
 
 defm t2TST  : T2I_cmp_irs<0b0000, "tst",
-                          BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>>;
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
 defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
-                          BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>>;
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
 
 // Conditional moves
 // FIXME: should be able to write a pattern for ARMcmov, but can't use
 // a two-value operand where a dag node expects two operands. :(
 let neverHasSideEffects = 1 in {
-def t2MOVCCr : T2I<(outs rGPR:$dst), (ins rGPR:$false, rGPR:$true), IIC_iCMOVr,
-                   "mov", ".w\t$dst, $true",
-   [/*(set rGPR:$dst, (ARMcmov rGPR:$false, rGPR:$true, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $dst"> {
+def t2MOVCCr : T2TwoReg<
+                   (outs rGPR:$Rd), (ins rGPR:$false, rGPR:$Rm), IIC_iCMOVr,
+                   "mov", ".w\t$Rd, $Rm",
+   [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd"> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2183,10 +2719,11 @@ def t2MOVCCr : T2I<(outs rGPR:$dst), (ins rGPR:$false, rGPR:$true), IIC_iCMOVr,
   let Inst{7-4} = 0b0000;
 }
 
-def t2MOVCCi : T2I<(outs rGPR:$dst), (ins rGPR:$false, t2_so_imm:$true),
-                   IIC_iCMOVi, "mov", ".w\t$dst, $true",
-[/*(set rGPR:$dst,(ARMcmov rGPR:$false,t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
-                   RegConstraint<"$false = $dst"> {
+let isMoveImm = 1 in
+def t2MOVCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
+                   IIC_iCMOVi, "mov", ".w\t$Rd, $imm",
+[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+                   RegConstraint<"$false = $Rd"> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 0;
   let Inst{24-21} = 0b0010;
@@ -2195,9 +2732,49 @@ def t2MOVCCi : T2I<(outs rGPR:$dst), (ins rGPR:$false, t2_so_imm:$true),
   let Inst{15} = 0;
 }
 
+let isMoveImm = 1 in
+def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, i32imm_hilo16:$imm),
+                      IIC_iCMOVi,
+                      "movw", "\t$Rd, $imm", []>,
+                      RegConstraint<"$false = $Rd"> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+let isMoveImm = 1 in
+def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst),
+                               (ins rGPR:$false, i32imm:$src, pred:$p),
+                    IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">;
+
+let isMoveImm = 1 in
+def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
+                   IIC_iCMOVi, "mvn", ".w\t$Rd, $imm",
+[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
+                   imm:$cc, CCR:$ccr))*/]>,
+                   RegConstraint<"$false = $Rd"> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0011;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
 class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
                    string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
+  : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2205,22 +2782,22 @@ class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{19-16} = 0b1111; // Rn
   let Inst{5-4} = opcod; // Shift type.
 }
-def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$dst),
-                             (ins rGPR:$false, rGPR:$true, i32imm:$rhs),
-                             IIC_iCMOVsi, "lsl", ".w\t$dst, $true, $rhs", []>,
-                 RegConstraint<"$false = $dst">;
-def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$dst),
-                             (ins rGPR:$false, rGPR:$true, i32imm:$rhs),
-                             IIC_iCMOVsi, "lsr", ".w\t$dst, $true, $rhs", []>,
-                 RegConstraint<"$false = $dst">;
-def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$dst),
-                             (ins rGPR:$false, rGPR:$true, i32imm:$rhs),
-                             IIC_iCMOVsi, "asr", ".w\t$dst, $true, $rhs", []>,
-                 RegConstraint<"$false = $dst">;
-def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$dst),
-                             (ins rGPR:$false, rGPR:$true, i32imm:$rhs),
-                             IIC_iCMOVsi, "ror", ".w\t$dst, $true, $rhs", []>,
-                 RegConstraint<"$false = $dst">;
+def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
 } // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
@@ -2229,78 +2806,29 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$dst),
 
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
-def t2DMBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dmb", "",
-                    [(ARMMemBarrier)]>, Requires<[IsThumb, HasDB]> {
-  let Inst{31-4} = 0xF3BF8F5;
-  // FIXME: add support for options other than a full system DMB
-  let Inst{3-0} = 0b1111;
-}
-
-def t2DSBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dsb", "",
-                    [(ARMSyncBarrier)]>, Requires<[IsThumb, HasDB]> {
-  let Inst{31-4} = 0xF3BF8F4;
-  // FIXME: add support for options other than a full system DSB
-  let Inst{3-0} = 0b1111;
-}
+def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                  Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f5;
+  let Inst{3-0} = opt;
 }
-
-// Helper class for multiclass T2MemB -- for disassembly only
-class T2I_memb<string opc, string asm>
-  : T2I<(outs), (ins), NoItinerary, opc, asm,
-        [/* For disassembly only; pattern left blank */]>,
-    Requires<[IsThumb2, HasV7]> {
-  let Inst{31-20} = 0xf3b;
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
 }
 
-multiclass T2MemB<bits<4> op7_4, string opc> {
-
-  def st : T2I_memb<opc, "\tst"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b1110;
-  }
-
-  def ish : T2I_memb<opc, "\tish"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b1011;
-  }
-
-  def ishst : T2I_memb<opc, "\tishst"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b1010;
-  }
-
-  def nsh : T2I_memb<opc, "\tnsh"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b0111;
-  }
-
-  def nshst : T2I_memb<opc, "\tnshst"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b0110;
-  }
-
-  def osh : T2I_memb<opc, "\tosh"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b0011;
-  }
-
-  def oshst : T2I_memb<opc, "\toshst"> {
-    let Inst{7-4} = op7_4;
-    let Inst{3-0} = 0b0010;
-  }
+def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "dsb", "\t$opt",
+                  [/* For disassembly only; pattern left blank */]>,
+                  Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f4;
+  let Inst{3-0} = opt;
 }
 
-// These DMB variants are for disassembly only.
-defm t2DMB : T2MemB<0b0101, "dmb">;
-
-// These DSB variants are for disassembly only.
-defm t2DSB : T2MemB<0b0100, "dsb">;
-
 // ISB has only full system option -- for disassembly only
-def t2ISBsy : T2I_memb<"isb", ""> {
-  let Inst{7-4} = 0b0110;
+def t2ISB : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "isb", "",
+                  [/* For disassembly only; pattern left blank */]>,
+                  Requires<[IsThumb2, HasV7]> {
+  let Inst{31-4} = 0xf3bf8f6;
   let Inst{3-0} = 0b1111;
 }
 
@@ -2314,6 +2842,11 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   let Inst{7-6} = 0b01;
   let Inst{5-4} = opcod;
   let Inst{3-0} = 0b1111;
+
+  bits<4> Rn;
+  bits<4> Rt;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
 }
 class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
                 InstrItinClass itin, string opc, string asm, string cstr,
@@ -2324,60 +2857,88 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   let Inst{11-8} = rt2;
   let Inst{7-6} = 0b01;
   let Inst{5-4} = opcod;
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
 }
 
 let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone,
-                         Size4Bytes, NoItinerary, "ldrexb", "\t$dest, [$ptr]",
+def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone,
+                         Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, [$Rn]",
                          "", []>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone,
-                         Size4Bytes, NoItinerary, "ldrexh", "\t$dest, [$ptr]",
+def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone,
+                         Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, [$Rn]",
                          "", []>;
-def t2LDREX  : Thumb2I<(outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone,
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone,
                        Size4Bytes, NoItinerary,
-                       "ldrex", "\t$dest, [$ptr]", "",
+                       "ldrex", "\t$Rt, [$Rn]", "",
                       []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0000101;
   let Inst{11-8} = 0b1111;
   let Inst{7-0} = 0b00000000; // imm8 = 0
+
+  bits<4> Rn;
+  bits<4> Rt;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
 }
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$dest, rGPR:$dest2), (ins rGPR:$ptr),
+def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn),
                          AddrModeNone, Size4Bytes, NoItinerary,
-                         "ldrexd", "\t$dest, $dest2, [$ptr]", "",
-                         [], {?, ?, ?, ?}>;
+                         "ldrexd", "\t$Rt, $Rt2, [$Rn]", "",
+                         [], {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
 }
 
-let mayStore = 1, Constraints = "@earlyclobber $success" in {
-def t2STREXB : T2I_strex<0b00, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr),
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn),
                          AddrModeNone, Size4Bytes, NoItinerary,
-                         "strexb", "\t$success, $src, [$ptr]", "", []>;
-def t2STREXH : T2I_strex<0b01, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr),
+                         "strexb", "\t$Rd, $Rt, [$Rn]", "", []>;
+def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn),
                          AddrModeNone, Size4Bytes, NoItinerary,
-                         "strexh", "\t$success, $src, [$ptr]", "", []>;
-def t2STREX  : Thumb2I<(outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr),
+                         "strexh", "\t$Rd, $Rt, [$Rn]", "", []>;
+def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn),
                        AddrModeNone, Size4Bytes, NoItinerary,
-                       "strex", "\t$success, $src, [$ptr]", "",
+                       "strex", "\t$Rd, $Rt, [$Rn]", "",
                       []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0000100;
   let Inst{7-0} = 0b00000000; // imm8 = 0
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt;
 }
-def t2STREXD : T2I_strex<0b11, (outs rGPR:$success),
-                         (ins rGPR:$src, rGPR:$src2, rGPR:$ptr),
+def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, rGPR:$Rn),
                          AddrModeNone, Size4Bytes, NoItinerary,
-                         "strexd", "\t$success, $src, $src2, [$ptr]", "", [],
-                         {?, ?, ?, ?}>;
+                         "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", "", [],
+                         {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
 }
 
 // Clear-Exclusive is for disassembly only.
-def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "",
-                  [/* For disassembly only; pattern left blank */]>,
-            Requires<[IsARM, HasV7]>  {
-  let Inst{31-20} = 0xf3b;
+def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex",
+                   [/* For disassembly only; pattern left blank */]>,
+            Requires<[IsThumb2, HasV7]>  {
+  let Inst{31-16} = 0xf3bf;
   let Inst{15-14} = 0b10;
+  let Inst{13} = 0;
   let Inst{12} = 0;
+  let Inst{11-8} = 0b1111;
   let Inst{7-4} = 0b0010;
+  let Inst{3-0} = 0b1111;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2386,7 +2947,7 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "",
 
 // __aeabi_read_tp preserves the registers r1-r3.
 let isCall = 1,
-  Defs = [R0, R12, LR, CPSR] in {
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
   def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
                      "bl\t__aeabi_read_tp",
                      [(set R0, ARMthread_pointer)]> {
@@ -2413,32 +2974,18 @@ let Defs =
   [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
     D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
     D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
-    D31 ], hasSideEffects = 1, isBarrier = 1 in {
+    D31 ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
-                               AddrModeNone, SizeSpecial, NoItinerary,
-                               "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t"
-                               "adds\t$val, #7\n\t"
-                               "str\t$val, [$src, #4]\n\t"
-                               "movs\tr0, #0\n\t"
-                               "b\t1f\n\t"
-                               "movs\tr0, #1\t${:comment} end eh.setjmp\n\t"
-                               "1:", "",
+                               AddrModeNone, SizeSpecial, NoItinerary, "", "",
                           [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
                              Requires<[IsThumb2, HasVFP2]>;
 }
 
 let Defs =
   [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
-  hasSideEffects = 1, isBarrier = 1 in {
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
-                               AddrModeNone, SizeSpecial, NoItinerary,
-                               "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t"
-                               "adds\t$val, #7\n\t"
-                               "str\t$val, [$src, #4]\n\t"
-                               "movs\tr0, #0\n\t"
-                               "b\t1f\n\t"
-                               "movs\tr0, #1\t${:comment} end eh.setjmp\n\t"
-                               "1:", "",
+                               AddrModeNone, SizeSpecial, NoItinerary, "", "",
                           [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
                                   Requires<[IsThumb2, NoVFP]>;
 }
@@ -2453,82 +3000,77 @@ let Defs =
 // operand list.
 // FIXME: Should pc be an implicit operand like PICADD, etc?
 let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
-    hasExtraDefRegAllocReq = 1 in
-  def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                         reglist:$dsts, variable_ops), IIC_Br,
-                        "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts",
-                        "$addr.addr = $wb", []> {
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                        reglist:$regs, variable_ops),
+                        IIC_iLoad_mBr,
+                        "ldmia${p}.w\t$Rn!, $regs",
+                        "$Rn = $wb", []> {
+  bits<4>  Rn;
+  bits<16> regs;
+
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b00;
-  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
-  let Inst{22} = 0;
-  let Inst{21} = 1; // The W bit.
-  let Inst{20} = 1; // Load
+  let Inst{24-23} = 0b01;     // Increment After
+  let Inst{22}    = 0;
+  let Inst{21}    = 1;        // Writeback
+  let Inst{20}    = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0}  = regs;
 }
 
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 let isPredicable = 1 in
-def t2B   : T2XI<(outs), (ins brtarget:$target), IIC_Br,
+def t2B   : T2XI<(outs), (ins uncondbrtarget:$target), IIC_Br,
                  "b.w\t$target",
                  [(br bb:$target)]> {
   let Inst{31-27} = 0b11110;
   let Inst{15-14} = 0b10;
   let Inst{12} = 1;
+
+  bits<20> target;
+  let Inst{26} = target{19};
+  let Inst{11} = target{18};
+  let Inst{13} = target{17};
+  let Inst{21-16} = target{16-11};
+  let Inst{10-0} = target{10-0};
 }
 
 let isNotDuplicable = 1, isIndirectBranch = 1 in {
-def t2BR_JT :
-    T2JTI<(outs),
-          (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id),
-           IIC_Br, "mov\tpc, $target$jt",
-          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0100100;
-  let Inst{19-16} = 0b1111;
-  let Inst{14-12} = 0b000;
-  let Inst{11-8} = 0b1111; // Rd = pc
-  let Inst{7-4} = 0b0000;
-}
+def t2BR_JT : t2PseudoInst<(outs),
+          (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id),
+           SizeSpecial, IIC_Br,
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
 
 // FIXME: Add a non-pc based case that can be predicated.
-def t2TBB :
-    T2JTI<(outs),
-        (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
-         IIC_Br, "tbb\t$index$jt", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0001101;
-  let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction)
-  let Inst{15-8} = 0b11110000;
-  let Inst{7-4} = 0b0000; // B form
-}
-
-def t2TBH :
-    T2JTI<(outs),
-        (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
-         IIC_Br, "tbh\t$index$jt", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0001101;
-  let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction)
-  let Inst{15-8} = 0b11110000;
-  let Inst{7-4} = 0b0001; // H form
-}
-
-// Generic versions of the above two instructions, for disassembly only
-
-def t2TBBgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br,
-                    "tbb", "\t[$a, $b]", []>{
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0001101;
-  let Inst{15-8} = 0b11110000;
-  let Inst{7-4} = 0b0000; // B form
-}
-
-def t2TBHgen : T2I<(outs), (ins GPR:$a, GPR:$b), IIC_Br,
-                   "tbh", "\t[$a, $b, lsl #1]", []> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0001101;
-  let Inst{15-8} = 0b11110000;
-  let Inst{7-4} = 0b0001; // H form
+def t2TBB_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id),
+         SizeSpecial, IIC_Br, []>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id),
+         SizeSpecial, IIC_Br, []>;
+
+def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
+                    "tbb", "\t[$Rn, $Rm]", []> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 0; // B form
+  let Inst{3-0} = Rm;
+}
+
+def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
+                   "tbh", "\t[$Rn, $Rm, lsl #1]", []> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 1; // H form
+  let Inst{3-0} = Rm;
 }
 } // isNotDuplicable, isIndirectBranch
 
@@ -2543,6 +3085,16 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
   let Inst{31-27} = 0b11110;
   let Inst{15-14} = 0b10;
   let Inst{12} = 0;
+
+  bits<4> p;
+  let Inst{25-22} = p;
+
+  bits<21> target;
+  let Inst{26} = target{20};
+  let Inst{11} = target{19};
+  let Inst{13} = target{18};
+  let Inst{21-16} = target{17-12};
+  let Inst{10-0} = target{11-1};
 }
 
 
@@ -2554,6 +3106,11 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
   // 16-bit instruction.
   let Inst{31-16} = 0x0000;
   let Inst{15-8} = 0b10111111;
+
+  bits<4> cc;
+  bits<4> mask;
+  let Inst{7-4} = cc;
+  let Inst{3-0} = mask;
 }
 
 // Branch and Exchange Jazelle -- for disassembly only
@@ -2565,22 +3122,44 @@ def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func",
   let Inst{25-20} = 0b111100;
   let Inst{15-14} = 0b10;
   let Inst{12} = 0;
+
+  bits<4> func;
+  let Inst{19-16} = func;
 }
 
-// Change Processor State is a system instruction -- for disassembly only.
-// The singleton $opt operand contains the following information:
-// opt{4-0} = mode from Inst{4-0}
-// opt{5} = changemode from Inst{17}
-// opt{8-6} = AIF from Inst{8-6}
-// opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable
-def t2CPS : T2XI<(outs),(ins cps_opt:$opt), NoItinerary, "cps$opt",
-                 [/* For disassembly only; pattern left blank */]> {
+// Change Processor State is a system instruction -- for disassembly and
+// parsing only.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
+            !strconcat("cps", asm_op),
+            [/* For disassembly only; pattern left blank */]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
   let Inst{31-27} = 0b11110;
-  let Inst{26} = 0;
+  let Inst{26}    = 0;
   let Inst{25-20} = 0b111010;
+  let Inst{19-16} = 0b1111;
   let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
-}
+  let Inst{12}    = 0;
+  let Inst{10-9}  = imod;
+  let Inst{8}     = M;
+  let Inst{7-5}   = iflags;
+  let Inst{4-0}   = mode;
+}
+
+let M = 1 in
+  def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+                      "$imod.w\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
+                      "$imod.w\t$iflags">;
+let imod = 0, iflags = 0, M = 1 in
+  def t2CPS1p : t2CPS<(ins i32imm:$mode), "\t$mode">;
 
 // A6.3.4 Branches and miscellaneous control
 // Table A6-14 Change Processor State, and hint instructions
@@ -2589,6 +3168,7 @@ class T2I_hint<bits<8> op7_0, string opc, string asm>
   : T2I<(outs), (ins), NoItinerary, opc, asm,
         [/* For disassembly only; pattern left blank */]> {
   let Inst{31-20} = 0xf3a;
+  let Inst{19-16} = 0b1111;
   let Inst{15-14} = 0b10;
   let Inst{12} = 0;
   let Inst{10-8} = 0b000;
@@ -2608,6 +3188,9 @@ def t2DBG : T2I<(outs),(ins i32imm:$opt), NoItinerary, "dbg", "\t$opt",
   let Inst{12} = 0;
   let Inst{10-8} = 0b000;
   let Inst{7-4} = 0b1111;
+
+  bits<4> opt;
+  let Inst{3-0} = opt;
 }
 
 // Secure Monitor Call is a system instruction -- for disassembly only
@@ -2617,83 +3200,86 @@ def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
   let Inst{31-27} = 0b11110;
   let Inst{26-20} = 0b1111111;
   let Inst{15-12} = 0b1000;
-}
 
-// Store Return State is a system instruction -- for disassembly only
-def t2SRSDBW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0000010; // W = 1
+  bits<4> opt;
+  let Inst{19-16} = opt;
 }
 
-def t2SRSDB  : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0000000; // W = 0
-}
+class T2SRS<bits<12> op31_20,
+           dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-20} = op31_20{11-0};
 
-def t2SRSIAW : T2I<(outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0011010; // W = 1
+  bits<5> mode;
+  let Inst{4-0} = mode{4-0};
 }
 
-def t2SRSIA  : T2I<(outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0011000; // W = 0
-}
+// Store Return State is a system instruction -- for disassembly only
+def t2SRSDBW : T2SRS<0b111010000010,
+                   (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2SRSDB  : T2SRS<0b111010000000,
+                   (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2SRSIAW : T2SRS<0b111010011010,
+                   (outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2SRSIA  : T2SRS<0b111010011000,
+                   (outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode",
+                   [/* For disassembly only; pattern left blank */]>;
 
 // Return From Exception is a system instruction -- for disassembly only
-def t2RFEDBW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfedb", "\t$base!",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0000011; // W = 1
-}
 
-def t2RFEDB  : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeab", "\t$base",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0000001; // W = 0
-}
+class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-20} = op31_20{11-0};
 
-def t2RFEIAW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base!",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0011011; // W = 1
+  bits<4> Rn;
+  let Inst{19-16} = Rn;
 }
 
-def t2RFEIA  : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-20} = 0b0011001; // W = 0
-}
+def t2RFEDBW : T2RFE<0b111010000011,
+                   (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEDB  : T2RFE<0b111010000001,
+                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIAW : T2RFE<0b111010011011,
+                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIA  : T2RFE<0b111010011001,
+                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
 
-// Two piece so_imms.
-def : T2Pat<(or rGPR:$LHS, t2_so_imm2part:$RHS),
-             (t2ORRri (t2ORRri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
-                    (t2_so_imm2part_2 imm:$RHS))>;
-def : T2Pat<(xor rGPR:$LHS, t2_so_imm2part:$RHS),
-             (t2EORri (t2EORri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
-                    (t2_so_imm2part_2 imm:$RHS))>;
-def : T2Pat<(add rGPR:$LHS, t2_so_imm2part:$RHS),
-             (t2ADDri (t2ADDri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
-                    (t2_so_imm2part_2 imm:$RHS))>;
-def : T2Pat<(add rGPR:$LHS, t2_so_neg_imm2part:$RHS),
-             (t2SUBri (t2SUBri rGPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)),
-                    (t2_so_neg_imm2part_2 imm:$RHS))>;
-
 // 32-bit immediate using movw + movt.
-// This is a single pseudo instruction to make it re-materializable. Remove
-// when we can do generalized remat.
-let isReMaterializable = 1 in
-def t2MOVi32imm : T2Ix2<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
-                   "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
-                     [(set rGPR:$dst, (i32 imm:$src))]>;
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                            [(set rGPR:$dst, (i32 imm:$src))]>,
+                            Requires<[IsThumb, HasV6T2]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2addpc,
+                          [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                          Requires<[IsThumb2, UseMovt]>;
+
+def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2,
+                          [(set rGPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
+                          Requires<[IsThumb2, UseMovt]>;
+}
 
 // ConstantPool, GlobalAddress, and JumpTable
 def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>,
@@ -2709,10 +3295,9 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
 // be expanded into two instructions late to allow if-conversion and
 // scheduling.
 let canFoldAsLoad = 1, isReMaterializable = 1 in
-def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
-                   NoItinerary,
-                   "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
-               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   IIC_iLoadiALU,
+              [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
                                            imm:$cp))]>,
                Requires<[IsThumb2]>;
 
@@ -2720,48 +3305,128 @@ def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
 // Move between special register and ARM core register -- for disassembly only
 //
 
-// Rd = Instr{11-8}
-def t2MRS : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr",
-                [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{26} = 0;
-  let Inst{25-21} = 0b11111;
-  let Inst{20} = 0; // The R bit.
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
+class T2SpecialReg<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-20} = op31_20{11-0};
+  let Inst{15-14} = op15_14{1-0};
+  let Inst{12} = op12{0};
 }
 
-// Rd = Instr{11-8}
-def t2MRSsys : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr",
-                   [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{26} = 0;
-  let Inst{25-21} = 0b11111;
-  let Inst{20} = 1; // The R bit.
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
+class T2MRS<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2SpecialReg<op31_20, op15_14, op12, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{11-8} = Rd;
+  let Inst{19-16} = 0b1111;
 }
 
-// Rn = Inst{19-16}
-def t2MSR : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr",
-                "\tcpsr$mask, $src",
-                [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{26} = 0;
-  let Inst{25-21} = 0b11100;
-  let Inst{20} = 0; // The R bit.
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
+def t2MRS : T2MRS<0b111100111110, 0b10, 0,
+                (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
+                [/* For disassembly only; pattern left blank */]>;
+def t2MRSsys : T2MRS<0b111100111111, 0b10, 0,
+                   (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+                   [/* For disassembly only; pattern left blank */]>;
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */,
+                         0 /* op12 */, (outs), (ins msr_mask:$mask, rGPR:$Rn),
+                         NoItinerary, "msr", "\t$mask, $Rn",
+                         [/* For disassembly only; pattern left blank */]> {
+  bits<5> mask;
+  bits<4> Rn;
+  let Inst{19-16} = Rn;
+  let Inst{20}    = mask{4}; // R Bit
+  let Inst{13}    = 0b0;
+  let Inst{11-8}  = mask{3-0};
 }
 
-// Rn = Inst{19-16}
-def t2MSRsys : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr",
-                   "\tspsr$mask, $src",
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register -- for disassembly only
+//
+
+class t2MovRCopro<string opc, bit direction>
+  : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
+                       GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+          [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-24} = 0b1110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */>;
+def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */>;
+
+class t2MovRRCopro<string opc, bit direction>
+  : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
+          [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def t2MCRR2 : t2MovRRCopro<"mcrr2",
+                           0 /* from ARM core register to coprocessor */>;
+def t2MRRC2 : t2MovRRCopro<"mrrc2",
+                           1 /* from coprocessor to ARM core register */>;
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.  For disassembly only.
+//
+
+def t2CDP2 : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
+                   c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+                   "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                    [/* For disassembly only; pattern left blank */]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{26} = 0;
-  let Inst{25-21} = 0b11100;
-  let Inst{20} = 1; // The R bit.
-  let Inst{15-14} = 0b10;
-  let Inst{12} = 0;
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
index c29e096..920c5c9 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -1,4 +1,4 @@
-//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===//
+//===- ARMInstrVFP.td - VFP support for ARM ----------------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,30 +11,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-def SDT_FTOI :
-SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
-def SDT_ITOF :
-SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
-def SDT_CMPFP0 :
-SDTypeProfile<0, 1, [SDTCisFP<0>]>;
-def SDT_VMOVDRR :
-SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
-                     SDTCisSameAs<1, 2>]>;
-
-def arm_ftoui  : SDNode<"ARMISD::FTOUI",  SDT_FTOI>;
-def arm_ftosi  : SDNode<"ARMISD::FTOSI",  SDT_FTOI>;
-def arm_sitof  : SDNode<"ARMISD::SITOF",  SDT_ITOF>;
-def arm_uitof  : SDNode<"ARMISD::UITOF",  SDT_ITOF>;
-def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInFlag,SDNPOutFlag]>;
-def arm_cmpfp  : SDNode<"ARMISD::CMPFP",  SDT_ARMCmp, [SDNPOutFlag]>;
-def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0",SDT_CMPFP0, [SDNPOutFlag]>;
-def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR",  SDT_VMOVDRR>;
+def SDT_FTOI    : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF    : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                                       SDTCisSameAs<1, 2>]>;
+
+def arm_ftoui  : SDNode<"ARMISD::FTOUI",   SDT_FTOI>;
+def arm_ftosi  : SDNode<"ARMISD::FTOSI",   SDT_FTOI>;
+def arm_sitof  : SDNode<"ARMISD::SITOF",   SDT_ITOF>;
+def arm_uitof  : SDNode<"ARMISD::UITOF",   SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+
 
 //===----------------------------------------------------------------------===//
 // Operand Definitions.
 //
 
-
 def vfp_f32imm : Operand<f32>,
                  PatLeaf<(f32 fpimm), [{
       return ARM::getVFPf32Imm(N->getValueAPF()) != -1;
@@ -55,86 +51,136 @@ def vfp_f64imm : Operand<f64>,
 //
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
-                 IIC_fpLoad64, "vldr", ".64\t$dst, $addr",
-                 [(set DPR:$dst, (f64 (load addrmode5:$addr)))]>;
 
-def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr),
-                 IIC_fpLoad32, "vldr", ".32\t$dst, $addr",
-                 [(set SPR:$dst, (load addrmode5:$addr))]>;
-} // canFoldAsLoad
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
+                 IIC_fpLoad64, "vldr", ".64\t$Dd, $addr",
+                 [(set DPR:$Dd, (f64 (load addrmode5:$addr)))]>;
 
-def VSTRD  : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr),
-                 IIC_fpStore64, "vstr", ".64\t$src, $addr",
-                 [(store (f64 DPR:$src), addrmode5:$addr)]>;
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
+                 IIC_fpLoad32, "vldr", ".32\t$Sd, $addr",
+                 [(set SPR:$Sd, (load addrmode5:$addr))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VSTRS  : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
-                 IIC_fpStore32, "vstr", ".32\t$src, $addr",
-                 [(store SPR:$src, addrmode5:$addr)]>;
+} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
 
-//===----------------------------------------------------------------------===//
-//  Load / store multiple Instructions.
-//
+def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
+                 IIC_fpStore64, "vstr", ".64\t$Dd, $addr",
+                 [(store (f64 DPR:$Dd), addrmode5:$addr)]>;
 
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
-                  "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
-  let Inst{20} = 1;
+def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
+                 IIC_fpStore32, "vstr", ".32\t$Sd, $addr",
+                 [(store SPR:$Sd, addrmode5:$addr)]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
-def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
-                  "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
-  let Inst{20} = 1;
-}
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
 
-def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm,
-                      "vldm${addr:submode}${p}\t$addr!, $dsts",
-                      "$addr.addr = $wb", []> {
-  let Inst{20} = 1;
+multiclass vfp_ldst_mult<string asm, bit L_bit,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // Double Precision
+  def DIA :
+    AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DIA_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DDB :
+    AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DDB_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+
+  // Single Precision
+  def SIA :
+    AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SIA_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SDB :
+    AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SDB_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
 }
 
-def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm, 
-                      "vldm${addr:submode}${p}\t$addr!, $dsts",
-                      "$addr.addr = $wb", []> {
-  let Inst{20} = 1;
-}
-} // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq
+let neverHasSideEffects = 1 in {
 
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
-                  "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
-  let Inst{20} = 0;
-}
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
 
-def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
-                  "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
-  let Inst{20} = 0;
-}
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpLoad_m, IIC_fpLoad_mu>;
 
-def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
-                      "vstm${addr:submode}${p}\t$addr!, $srcs",
-                      "$addr.addr = $wb", []> {
-  let Inst{20} = 0;
-}
+} // neverHasSideEffects
 
-def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
-                      "vstm${addr:submode}${p}\t$addr!, $srcs",
-                      "$addr.addr = $wb", []> {
-  let Inst{20} = 0;
-}
-} // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq
+def : MnemonicAlias<"vldm", "vldmia">;
+def : MnemonicAlias<"vstm", "vstmia">;
 
 // FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
 
@@ -142,56 +188,71 @@ def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 // FP Binary Operations.
 //
 
-def VADDD  : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
-                 IIC_fpALU64, "vadd", ".f64\t$dst, $a, $b",
-                 [(set DPR:$dst, (fadd DPR:$a, (f64 DPR:$b)))]>;
-
-def VADDS  : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
-                  IIC_fpALU32, "vadd", ".f32\t$dst, $a, $b",
-                  [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
-
-// These are encoded as unary instructions.
-let Defs = [FPSCR] in {
-def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$a, DPR:$b),
-                 IIC_fpCMP64, "vcmpe", ".f64\t$a, $b",
-                 [(arm_cmpfp DPR:$a, (f64 DPR:$b))]>;
-
-def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$a, DPR:$b),
-                 IIC_fpCMP64, "vcmp", ".f64\t$a, $b",
-                 [/* For disassembly only; pattern left blank */]>;
-
-def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$a, SPR:$b),
-                 IIC_fpCMP32, "vcmpe", ".f32\t$a, $b",
-                 [(arm_cmpfp SPR:$a, SPR:$b)]>;
-
-def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$a, SPR:$b),
-                 IIC_fpCMP32, "vcmp", ".f32\t$a, $b",
-                 [/* For disassembly only; pattern left blank */]>;
+def VADDD  : ADbI<0b11100, 0b11, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
-def VDIVD  : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
-                 IIC_fpDIV64, "vdiv", ".f64\t$dst, $a, $b",
-                 [(set DPR:$dst, (fdiv DPR:$a, (f64 DPR:$b)))]>;
-
-def VDIVS  : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
-                 IIC_fpDIV32, "vdiv", ".f32\t$dst, $a, $b",
-                 [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
-
-def VMULD  : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
-                 IIC_fpMUL64, "vmul", ".f64\t$dst, $a, $b",
-                 [(set DPR:$dst, (fmul DPR:$a, (f64 DPR:$b)))]>;
-
-def VMULS  : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
-                  IIC_fpMUL32, "vmul", ".f32\t$dst, $a, $b",
-                  [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VNMULD  : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
-                  IIC_fpMUL64, "vnmul", ".f64\t$dst, $a, $b",
-                  [(set DPR:$dst, (fneg (fmul DPR:$a, (f64 DPR:$b))))]>;
+def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+def VDIVS  : ASbI<0b11101, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+
+def VMULD  : ADbI<0b11100, 0b10, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VNMULS  : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
-                  IIC_fpMUL32, "vnmul", ".f32\t$dst, $a, $b",
-                  [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>;
+def VNMULD : ADbI<0b11100, 0b10, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+
+def VNMULS : ASbI<0b11100, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 // Match reassociated forms only if not sign dependent rounding.
 def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
@@ -199,53 +260,128 @@ def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
 def : Pat<(fmul (fneg SPR:$a), SPR:$b),
           (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
 
+// These are encoded as unary instructions.
+let Defs = [FPSCR] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VSUBD  : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
-                 IIC_fpALU64, "vsub", ".f64\t$dst, $a, $b",
-                 [(set DPR:$dst, (fsub DPR:$a, (f64 DPR:$b)))]>;
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
+                  [/* For disassembly only; pattern left blank */]>;
 
-def VSUBS  : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
-                  IIC_fpALU32, "vsub", ".f32\t$dst, $a, $b",
-                  [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>;
+def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
+                  [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+} // Defs = [FPSCR]
 
 //===----------------------------------------------------------------------===//
 // FP Unary Operations.
 //
 
-def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs DPR:$dst), (ins DPR:$a),
-                 IIC_fpUNA64, "vabs", ".f64\t$dst, $a",
-                 [(set DPR:$dst, (fabs (f64 DPR:$a)))]>;
-
-def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,(outs SPR:$dst), (ins SPR:$a),
-                  IIC_fpUNA32, "vabs", ".f32\t$dst, $a",
-                  [(set SPR:$dst, (fabs SPR:$a))]>;
+def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>;
+
+def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fabs SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 let Defs = [FPSCR] in {
-def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$a),
-                  IIC_fpCMP64, "vcmpe", ".f64\t$a, #0",
-                  [(arm_cmpfp0 (f64 DPR:$a))]>;
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
+                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
 
-def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$a),
-                  IIC_fpCMP64, "vcmp", ".f64\t$a, #0",
-                  [/* For disassembly only; pattern left blank */]>;
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
+                   [(arm_cmpfp0 SPR:$Sd)]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
 
-def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$a),
-                  IIC_fpCMP32, "vcmpe", ".f32\t$a, #0",
-                  [(arm_cmpfp0 SPR:$a)]>;
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$a),
-                  IIC_fpCMP32, "vcmp", ".f32\t$a, #0",
-                  [/* For disassembly only; pattern left blank */]>;
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
 }
 
-def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTDS, "vcvt", ".f64.f32\t$dst, $a",
-                 [(set DPR:$dst, (fextend SPR:$a))]>;
+def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+} // Defs = [FPSCR]
+
+def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+                   [(set DPR:$Dd, (fextend SPR:$Sm))]> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+}
 
 // Special case encoding: bits 11-8 is 0b1011.
-def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
-                   IIC_fpCVTSD, "vcvt", ".f32.f64\t$dst, $a",
-                   [(set SPR:$dst, (fround DPR:$a))]> {
+def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+                    IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+                    [(set SPR:$Sd, (fround DPR:$Dm))]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
   let Inst{27-23} = 0b11101;
   let Inst{21-16} = 0b110111;
   let Inst{11-8}  = 0b1011;
@@ -255,6 +391,7 @@ def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
 
 // Between half-precision and single-precision.  For disassembly only.
 
+// FIXME: Verify encoding after integrated assembler is working.
 def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
                  /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a",
                  [/* For disassembly only; pattern left blank */]>;
@@ -277,47 +414,94 @@ def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
                  /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a",
                  [/* For disassembly only; pattern left blank */]>;
 
-let neverHasSideEffects = 1 in {
-def VMOVD: ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$dst), (ins DPR:$a),
-                 IIC_fpUNA64, "vmov", ".f64\t$dst, $a", []>;
-
-def VMOVS: ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpUNA32, "vmov", ".f32\t$dst, $a", []>;
-} // neverHasSideEffects
+def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>;
+
+def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fneg SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$dst), (ins DPR:$a),
-                 IIC_fpUNA64, "vneg", ".f64\t$dst, $a",
-                 [(set DPR:$dst, (fneg (f64 DPR:$a)))]>;
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
 
-def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,(outs SPR:$dst), (ins SPR:$a),
-                  IIC_fpUNA32, "vneg", ".f32\t$dst, $a",
-                  [(set SPR:$dst, (fneg SPR:$a))]>;
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
+                  [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
 
-def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$dst), (ins DPR:$a),
-                 IIC_fpSQRT64, "vsqrt", ".f64\t$dst, $a",
-                 [(set DPR:$dst, (fsqrt (f64 DPR:$a)))]>;
+let neverHasSideEffects = 1 in {
+def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
 
-def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpSQRT32, "vsqrt", ".f32\t$dst, $a",
-                 [(set SPR:$dst, (fsqrt SPR:$a))]>;
+def VMOVS  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+} // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
 // FP <-> GPR Copies.  Int <-> FP Conversions.
 //
 
-def VMOVRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src),
-                 IIC_fpMOVSI, "vmov", "\t$dst, $src",
-                 [(set GPR:$dst, (bitconvert SPR:$src))]>;
+def VMOVRS : AVConv2I<0b11100001, 0b1010,
+                      (outs GPR:$Rt), (ins SPR:$Sn),
+                      IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
+                      [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+  // Instruction operands.
+  bits<4> Rt;
+  bits<5> Sn;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+}
 
-def VMOVSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src),
-                 IIC_fpMOVIS, "vmov", "\t$dst, $src",
-                 [(set SPR:$dst, (bitconvert GPR:$src))]>;
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+                      [(set SPR:$Sn, (bitconvert GPR:$Rt))]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+}
 
 let neverHasSideEffects = 1 in {
 def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
-                      (outs GPR:$wb, GPR:$dst2), (ins DPR:$src),
-                 IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src",
+                        (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+                        IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
                  [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
   let Inst{7-6} = 0b00;
 }
 
@@ -333,10 +517,21 @@ def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
 // FMDLR: GPR -> SPR
 
 def VMOVDRR : AVConv5I<0b11000100, 0b1011,
-                     (outs DPR:$dst), (ins GPR:$src1, GPR:$src2),
-                IIC_fpMOVID, "vmov", "\t$dst, $src1, $src2",
-                [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]> {
-  let Inst{7-6} = 0b00;
+                      (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+                      IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+                      [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6}   = 0b00;
 }
 
 let neverHasSideEffects = 1 in
@@ -350,102 +545,183 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
 // FMRDH: SPR -> GPR
 // FMRDL: SPR -> GPR
 // FMRRS: SPR -> GPR
-// FMRX : SPR system reg -> GPR
-
+// FMRX:  SPR system reg -> GPR
 // FMSRR: GPR -> SPR
+// FMXR:  GPR -> VFP system reg
+
+
+// Int -> FP:
+
+class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+}
 
-// FMXR: GPR -> VFP Sstem reg
-
-
-// Int to FP:
+class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,InstrItinClass itin,
+                         string opc, string asm, list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
 
-def VSITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011,
-                 (outs DPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a",
-                 [(set DPR:$dst, (f64 (arm_sitof SPR:$a)))]> {
+def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> {
   let Inst{7} = 1; // s32
 }
 
-def VSITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010,
-                 (outs SPR:$dst),(ins SPR:$a),
-                 IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a",
-                 [(set SPR:$dst, (arm_sitof SPR:$a))]> {
+def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd),(ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> {
   let Inst{7} = 1; // s32
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
-def VUITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011,
-                 (outs DPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a",
-                 [(set DPR:$dst, (f64 (arm_uitof SPR:$a)))]> {
+def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> {
   let Inst{7} = 0; // u32
 }
 
-def VUITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010,
-                 (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a",
-                 [(set SPR:$dst, (arm_uitof SPR:$a))]> {
+def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd), (ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> {
   let Inst{7} = 0; // u32
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
-// FP to Int:
-// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+// FP -> Int:
+
+class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,
+                         InstrItinClass itin, string opc, string asm,
+                         list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
 
-def VTOSIZD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011,
-                       (outs SPR:$dst), (ins DPR:$a),
-                 IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a",
-                 [(set SPR:$dst, (arm_ftosi (f64 DPR:$a)))]> {
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> {
   let Inst{7} = 1; // Z bit
 }
 
-def VTOSIZS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010,
-                        (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a",
-                 [(set SPR:$dst, (arm_ftosi SPR:$a))]> {
+def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
-def VTOUIZD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011,
-                       (outs SPR:$dst), (ins DPR:$a),
-                 IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a",
-                 [(set SPR:$dst, (arm_ftoui (f64 DPR:$a)))]> {
+def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                               (outs SPR:$Sd), (ins DPR:$Dm),
+                               IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
+                               [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> {
   let Inst{7} = 1; // Z bit
 }
 
-def VTOUIZS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010,
-                        (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a",
-                 [(set SPR:$dst, (arm_ftoui SPR:$a))]> {
+def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> {
   let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
-// For disassembly only.
 let Uses = [FPSCR] in {
-def VTOSIRD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011,
-                       (outs SPR:$dst), (ins DPR:$a),
-                 IIC_fpCVTDI, "vcvtr", ".s32.f64\t$dst, $a",
-                 [(set SPR:$dst, (int_arm_vcvtr (f64 DPR:$a)))]> {
+// FIXME: Verify encoding after integrated assembler is working.
+def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
   let Inst{7} = 0; // Z bit
 }
 
-def VTOSIRS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010,
-                        (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$dst, $a",
-                 [(set SPR:$dst, (int_arm_vcvtr SPR:$a))]> {
+def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
   let Inst{7} = 0; // Z bit
 }
 
-def VTOUIRD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011,
-                       (outs SPR:$dst), (ins DPR:$a),
-                 IIC_fpCVTDI, "vcvtr", ".u32.f64\t$dst, $a",
-                 [(set SPR:$dst, (int_arm_vcvtru (f64 DPR:$a)))]> {
+def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
   let Inst{7} = 0; // Z bit
 }
 
-def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010,
-                        (outs SPR:$dst), (ins SPR:$a),
-                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$dst, $a",
-                 [(set SPR:$dst, (int_arm_vcvtru SPR:$a))]> {
+def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
   let Inst{7} = 0; // Z bit
 }
 }
@@ -457,30 +733,47 @@ def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010,
 //   S32 (U=0, sx=1) -> SL
 //   U32 (U=1, sx=1) -> UL
 
-let Constraints = "$a = $dst" in {
+// FIXME: Marking these as codegen only seems wrong. They are real
+//        instructions(?)
+let Constraints = "$a = $dst", isCodeGenOnly = 1 in {
 
 // FP to Fixed-Point:
 
-let isCodeGenOnly = 1 in {
 def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0,
                        (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits),
@@ -501,30 +794,44 @@ def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1,
                        (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits),
                  IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits",
                  [/* For disassembly only; pattern left blank */]>;
-}
 
 // Fixed-Point to FP:
 
-let isCodeGenOnly = 1 in {
 def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1,
                        (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits),
                  IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits",
-                 [/* For disassembly only; pattern left blank */]>;
+                 [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
 def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0,
                        (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits),
@@ -545,70 +852,120 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1,
                        (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits),
                  IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits",
                  [/* For disassembly only; pattern left blank */]>;
-}
 
-} // End of 'let Constraints = "$src = $dst" in'
+} // End of 'let Constraints = "$a = $dst", isCodeGenOnly = 1 in'
 
 //===----------------------------------------------------------------------===//
 // FP FMA Operations.
 //
 
-def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0,
-                (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
-                IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b",
-                [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b),
-                                      (f64 DPR:$dstin)))]>,
-                RegConstraint<"$dstin = $dst">;
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,UseFPVMLx]>;
 
 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
-                 (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
-                 IIC_fpMAC32, "vmla", ".f32\t$dst, $a, $b",
-                 [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
-                 RegConstraint<"$dstin = $dst">;
-
-def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0,
-                (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
-                IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b",
-                [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b),
-                                (f64 DPR:$dstin)))]>,
-                RegConstraint<"$dstin = $dst">;
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 
-def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
-                (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
-                IIC_fpMAC32, "vnmls", ".f32\t$dst, $a, $b",
-                [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
-                RegConstraint<"$dstin = $dst">;
-
-def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0,
-                 (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
-                 IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b",
-             [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)),
-                             (f64 DPR:$dstin)))]>,
-                RegConstraint<"$dstin = $dst">;
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
+
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,UseFPVMLx]>;
 
 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
-                  (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
-                  IIC_fpMAC32, "vmls", ".f32\t$dst, $a, $b",
-             [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
-                RegConstraint<"$dstin = $dst">;
-
-def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))),
-          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>;
-def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
-          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>;
-
-def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0,
-                 (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
-                 IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b",
-             [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)),
-                             (f64 DPR:$dstin)))]>,
-                RegConstraint<"$dstin = $dst">;
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
+
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP2,UseFPVMLx]>;
 
 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
-                (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
-                IIC_fpMAC32, "vnmla", ".f32\t$dst, $a, $b",
-             [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
-                RegConstraint<"$dstin = $dst">;
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
+
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP2,UseFPVMLx]>;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                  Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
+
 
 //===----------------------------------------------------------------------===//
 // FP Conditional moves.
@@ -616,92 +973,157 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
 
 let neverHasSideEffects = 1 in {
 def VMOVDcc  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
-                    (outs DPR:$dst), (ins DPR:$false, DPR:$true),
-                    IIC_fpUNA64, "vmov", ".f64\t$dst, $true",
-                [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
-                    RegConstraint<"$false = $dst">;
+                    (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                    IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm",
+                    [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
+                 RegConstraint<"$Dn = $Dd">;
 
 def VMOVScc  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
-                    (outs SPR:$dst), (ins SPR:$false, SPR:$true),
-                    IIC_fpUNA32, "vmov", ".f32\t$dst, $true",
-                [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
-                    RegConstraint<"$false = $dst">;
+                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                    IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm",
+                    [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
+                 RegConstraint<"$Sn = $Sd">;
 
 def VNEGDcc  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
-                    (outs DPR:$dst), (ins DPR:$false, DPR:$true),
-                    IIC_fpUNA64, "vneg", ".f64\t$dst, $true",
-                [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
-                    RegConstraint<"$false = $dst">;
+                    (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                    IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+                    [/*(set DPR:$Dd, (ARMcneg DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
+                 RegConstraint<"$Dn = $Dd">;
 
 def VNEGScc  : ASuI<0b11101, 0b11, 0b0001, 0b01, 0,
-                    (outs SPR:$dst), (ins SPR:$false, SPR:$true),
-                    IIC_fpUNA32, "vneg", ".f32\t$dst, $true",
-                [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
-                    RegConstraint<"$false = $dst">;
+                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                    IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+                    [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
+                 RegConstraint<"$Sn = $Sd"> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
 } // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
-// Misc.
+// Move from VFP System Register to ARM core register.
 //
 
-// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
-// to APSR.
-let Defs = [CPSR], Uses = [FPSCR] in
-def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs",
-                   "\tapsr_nzcv, fpscr",
-             [(arm_fmstat)]> {
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+                 list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> Rt;
+
   let Inst{27-20} = 0b11101111;
-  let Inst{19-16} = 0b0001;
-  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
   let Inst{11-8}  = 0b1010;
   let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
   let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
 }
 
-// FPSCR <-> GPR (for disassembly only)
+// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
+// to APSR.
+let Defs = [CPSR], Uses = [FPSCR], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+                        "vmrs", "\tapsr_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
 let hasSideEffects = 1, Uses = [FPSCR] in
-def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT,
-                 "vmrs", "\t$dst, fpscr",
-             [(set GPR:$dst, (int_arm_get_fpscr))]> {
-  let Inst{27-20} = 0b11101111;
-  let Inst{19-16} = 0b0001;
-  let Inst{11-8}  = 0b1010;
-  let Inst{7}     = 0;
-  let Inst{4}     = 1;
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+                      "vmrs", "\t$Rt, fpscr",
+                      [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+  def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpexc", []>;
+  def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpsid", []>;
 }
 
-let Defs = [FPSCR] in 
-def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, 
-                 "vmsr", "\tfpscr, $src",
-             [(int_arm_set_fpscr GPR:$src)]> {
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+               list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> src;
+
+  // Encode instruction operand.
+  let Inst{15-12} = src;
+
   let Inst{27-20} = 0b11101110;
-  let Inst{19-16} = 0b0001;
+  let Inst{19-16} = opc19_16;
   let Inst{11-8}  = 0b1010;
   let Inst{7}     = 0;
   let Inst{4}     = 1;
 }
 
+let Defs = [FPSCR] in {
+  // Application level GPR -> FPSCR
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+  // System level GPR -> FPEXC
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpexc, $src", []>;
+  // System level GPR -> FPSID
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpsid, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
 // Materialize FP immediates. VFP3 only.
 let isReMaterializable = 1 in {
-def FCONSTD : VFPAI<(outs DPR:$dst), (ins vfp_f64imm:$imm),
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
                     VFPMiscFrm, IIC_fpUNA64,
-                    "vmov", ".f64\t$dst, $imm",
-                    [(set DPR:$dst, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+                    "vmov", ".f64\t$Dd, $imm",
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+  // Instruction operands.
+  bits<5>  Dd;
+  bits<32> imm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+  let Inst{19}    = imm{31};
+  let Inst{18-16} = imm{22-20};
+  let Inst{3-0}   = imm{19-16};
+
+  // Encode remaining instruction bits.
   let Inst{27-23} = 0b11101;
   let Inst{21-20} = 0b11;
   let Inst{11-9}  = 0b101;
-  let Inst{8}     = 1;
+  let Inst{8}     = 1;          // Double precision.
   let Inst{7-4}   = 0b0000;
 }
 
-def FCONSTS : VFPAI<(outs SPR:$dst), (ins vfp_f32imm:$imm),
-                    VFPMiscFrm, IIC_fpUNA32,
-                    "vmov", ".f32\t$dst, $imm",
-                    [(set SPR:$dst, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+                     VFPMiscFrm, IIC_fpUNA32,
+                     "vmov", ".f32\t$Sd, $imm",
+                     [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<32> imm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+  let Inst{19}    = imm{31};    // The immediate is handled as a double.
+  let Inst{18-16} = imm{22-20};
+  let Inst{3-0}   = imm{19-16};
+
+  // Encode remaining instruction bits.
   let Inst{27-23} = 0b11101;
   let Inst{21-20} = 0b11;
   let Inst{11-9}  = 0b101;
-  let Inst{8}     = 0;
+  let Inst{8}     = 0;          // Single precision.
   let Inst{7-4}   = 0b0000;
 }
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
index 5f6d7ee..45b7e48 100644
--- a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/System/Memory.h"
+#include "llvm/Support/Memory.h"
 #include <cstdlib>
 using namespace llvm;
 
@@ -43,7 +43,7 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
 #define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
 
 // CompilationCallback stub - We can't use a C function with inline assembly in
-// it, because we the prolog/epilog inserted by GCC won't work for us (we need
+// it, because the prolog/epilog inserted by GCC won't work for us. (We need
 // to preserve more context and manipulate the stack directly).  Instead,
 // write our own wrapper, which does things our way, so we have complete
 // control over register saving and restoring.
@@ -97,9 +97,10 @@ extern "C" {
     "str  r0, [sp,#16]\n"
     // Return to the (newly modified) stub to invoke the real function.
     // The above twiddling of the saved return addresses allows us to
-    // deallocate everything, including the LR the stub saved, all in one
-    // pop instruction.
-    "ldmia  sp!, {r0, r1, r2, r3, lr, pc}\n"
+    // deallocate everything, including the LR the stub saved, with two
+    // updating load instructions.
+    "ldmia  sp!, {r0, r1, r2, r3, lr}\n"
+    "ldr    pc, [sp], #4\n"
       );
 #else  // Not an ARM host
   void ARMCompilationCallback() {
@@ -290,7 +291,7 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
       *((intptr_t*)RelocPos) |= ResultPtr;
       // Set register Rn to PC.
       *((intptr_t*)RelocPos) |=
-        ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+        getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
       break;
     }
     case ARM::reloc_arm_pic_jt:
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
index f5d9eff..2f97928 100644
--- a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
@@ -105,7 +105,7 @@ namespace llvm {
     /// model is PIC.
     void Initialize(const MachineFunction &MF, bool isPIC) {
       const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-      ConstPoolId2AddrMap.resize(AFI->getNumConstPoolEntries());
+      ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
       JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
       IsPIC = isPIC;
     }
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 2b7645a..d9dc5cd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -128,45 +128,153 @@ namespace {
   char ARMLoadStoreOpt::ID = 0;
 }
 
-static int getLoadStoreMultipleOpcode(int Opcode) {
+static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
   switch (Opcode) {
-  case ARM::LDR:
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDRi12:
     ++NumLDMGened;
-    return ARM::LDM;
-  case ARM::STR:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA;
+    case ARM_AM::da: return ARM::LDMDA;
+    case ARM_AM::db: return ARM::LDMDB;
+    case ARM_AM::ib: return ARM::LDMIB;
+    }
+    break;
+  case ARM::STRi12:
     ++NumSTMGened;
-    return ARM::STM;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA;
+    case ARM_AM::da: return ARM::STMDA;
+    case ARM_AM::db: return ARM::STMDB;
+    case ARM_AM::ib: return ARM::STMIB;
+    }
+    break;
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
     ++NumLDMGened;
-    return ARM::t2LDM;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA;
+    case ARM_AM::db: return ARM::t2LDMDB;
+    }
+    break;
   case ARM::t2STRi8:
   case ARM::t2STRi12:
     ++NumSTMGened;
-    return ARM::t2STM;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA;
+    case ARM_AM::db: return ARM::t2STMDB;
+    }
+    break;
   case ARM::VLDRS:
     ++NumVLDMGened;
-    return ARM::VLDMS;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA;
+    case ARM_AM::db: return ARM::VLDMSDB;
+    }
+    break;
   case ARM::VSTRS:
     ++NumVSTMGened;
-    return ARM::VSTMS;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA;
+    case ARM_AM::db: return ARM::VSTMSDB;
+    }
+    break;
   case ARM::VLDRD:
     ++NumVLDMGened;
-    return ARM::VLDMD;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA;
+    case ARM_AM::db: return ARM::VLDMDDB;
+    }
+    break;
   case ARM::VSTRD:
     ++NumVSTMGened;
-    return ARM::VSTMD;
-  default: llvm_unreachable("Unhandled opcode!");
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA;
+    case ARM_AM::db: return ARM::VSTMDDB;
+    }
+    break;
   }
+
   return 0;
 }
 
+namespace llvm {
+  namespace ARM_AM {
+
+AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMIA_UPD:
+  case ARM::STMIA:
+  case ARM::STMIA_UPD:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMIA_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+    return ARM_AM::ia;
+
+  case ARM::LDMDA:
+  case ARM::LDMDA_UPD:
+  case ARM::STMDA:
+  case ARM::STMDA_UPD:
+    return ARM_AM::da;
+
+  case ARM::LDMDB:
+  case ARM::LDMDB_UPD:
+  case ARM::STMDB:
+  case ARM::STMDB_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMDB:
+  case ARM::t2STMDB_UPD:
+  case ARM::VLDMSDB:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMSDB:
+  case ARM::VSTMSDB_UPD:
+  case ARM::VLDMDDB:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VSTMDDB:
+  case ARM::VSTMDDB_UPD:
+    return ARM_AM::db;
+
+  case ARM::LDMIB:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIB:
+  case ARM::STMIB_UPD:
+    return ARM_AM::ib;
+  }
+
+  return ARM_AM::bad_am_submode;
+}
+
+  } // end namespace ARM_AM
+} // end namespace llvm
+
 static bool isT2i32Load(unsigned Opc) {
   return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
 }
 
 static bool isi32Load(unsigned Opc) {
-  return Opc == ARM::LDR || isT2i32Load(Opc);
+  return Opc == ARM::LDRi12 || isT2i32Load(Opc);
 }
 
 static bool isT2i32Store(unsigned Opc) {
@@ -174,7 +282,7 @@ static bool isT2i32Store(unsigned Opc) {
 }
 
 static bool isi32Store(unsigned Opc) {
-  return Opc == ARM::STR || isT2i32Store(Opc);
+  return Opc == ARM::STRi12 || isT2i32Store(Opc);
 }
 
 /// MergeOps - Create and insert a LDM or STM with Base as base register and
@@ -245,10 +353,10 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
 
   bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VLDRD);
-  Opcode = getLoadStoreMultipleOpcode(Opcode);
+  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
     .addReg(Base, getKillRegState(BaseKill))
-    .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg);
+    .addImm(Pred).addReg(PredReg);
   for (unsigned i = 0; i != NumRegs; ++i)
     MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                      | getKillRegState(Regs[i].second));
@@ -271,22 +379,14 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
   // First calculate which of the registers should be killed by the merged
   // instruction.
   const unsigned insertPos = memOps[insertAfter].Position;
-
-  SmallSet<unsigned, 4> UnavailRegs;
   SmallSet<unsigned, 4> KilledRegs;
   DenseMap<unsigned, unsigned> Killer;
-  for (unsigned i = 0; i < memOpsBegin; ++i) {
-    if (memOps[i].Position < insertPos && memOps[i].isKill) {
-      unsigned Reg = memOps[i].Reg;
-      if (memOps[i].Merged)
-        UnavailRegs.insert(Reg);
-      else {
-        KilledRegs.insert(Reg);
-        Killer[Reg] = i;
-      }
+  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
+    if (i == memOpsBegin) {
+      i = memOpsEnd;
+      if (i == e)
+        break;
     }
-  }
-  for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) {
     if (memOps[i].Position < insertPos && memOps[i].isKill) {
       unsigned Reg = memOps[i].Reg;
       KilledRegs.insert(Reg);
@@ -297,12 +397,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
   SmallVector<std::pair<unsigned, bool>, 8> Regs;
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
     unsigned Reg = memOps[i].Reg;
-    if (UnavailRegs.count(Reg))
-      // Register is killed before and it's not easy / possible to update the
-      // kill marker on already merged instructions. Abort.
-      return;
-
-    // If we are inserting the merged operation after an unmerged operation that
+    // If we are inserting the merged operation after an operation that
     // uses the same register, make sure to transfer any kill flag.
     bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
     Regs.push_back(std::make_pair(Reg, isKill));
@@ -318,17 +413,24 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
   // Merge succeeded, update records.
   Merges.push_back(prior(Loc));
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    // Remove kill flags from any unmerged memops that come before insertPos.
+    // Remove kill flags from any memops that come before insertPos.
     if (Regs[i-memOpsBegin].second) {
       unsigned Reg = Regs[i-memOpsBegin].first;
       if (KilledRegs.count(Reg)) {
         unsigned j = Killer[Reg];
-        memOps[j].MBBI->getOperand(0).setIsKill(false);
+        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
+        assert(Idx >= 0 && "Cannot find killing operand");
+        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
         memOps[j].isKill = false;
       }
+      memOps[i].isKill = true;
     }
     MBB.erase(memOps[i].MBBI);
+    // Update this memop to refer to the merged instruction.
+    // We may need to move kill flags again.
     memOps[i].Merged = true;
+    memOps[i].MBBI = Merges.back();
+    memOps[i].Position = insertPos;
   }
 }
 
@@ -349,7 +451,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
   const MachineOperand &PMO = Loc->getOperand(0);
   unsigned PReg = PMO.getReg();
   unsigned PRegNum = PMO.isUndef() ? UINT_MAX
-    : ARMRegisterInfo::getRegisterNumbering(PReg);
+    : getARMRegisterNumbering(PReg);
   unsigned Count = 1;
 
   for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
@@ -357,7 +459,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
     const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
     unsigned Reg = MO.getReg();
     unsigned RegNum = MO.isUndef() ? UINT_MAX
-      : ARMRegisterInfo::getRegisterNumbering(Reg);
+      : getARMRegisterNumbering(Reg);
     // Register numbers must be in ascending order.  For VFP, the registers
     // must also be consecutive and there is a limit of 16 double-word
     // registers per instruction.
@@ -440,8 +542,8 @@ static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
 static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default: return 0;
-  case ARM::LDR:
-  case ARM::STR:
+  case ARM::LDRi12:
+  case ARM::STRi12:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -452,31 +554,109 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   case ARM::VLDRD:
   case ARM::VSTRD:
     return 8;
-  case ARM::LDM:
-  case ARM::STM:
-  case ARM::t2LDM:
-  case ARM::t2STM:
-  case ARM::VLDMS:
-  case ARM::VSTMS:
-    return (MI->getNumOperands() - 4) * 4;
-  case ARM::VLDMD:
-  case ARM::VSTMD:
-    return (MI->getNumOperands() - 4) * 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSDB:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSDB:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VLDMDDB:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDDB:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
   }
 }
 
-static unsigned getUpdatingLSMultipleOpcode(unsigned Opc) {
+static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
+                                            ARM_AM::AMSubMode Mode) {
   switch (Opc) {
-  case ARM::LDM: return ARM::LDM_UPD;
-  case ARM::STM: return ARM::STM_UPD;
-  case ARM::t2LDM: return ARM::t2LDM_UPD;
-  case ARM::t2STM: return ARM::t2STM_UPD;
-  case ARM::VLDMS: return ARM::VLDMS_UPD;
-  case ARM::VLDMD: return ARM::VLDMD_UPD;
-  case ARM::VSTMS: return ARM::VSTMS_UPD;
-  case ARM::VSTMD: return ARM::VSTMD_UPD;
   default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA_UPD;
+    case ARM_AM::ib: return ARM::LDMIB_UPD;
+    case ARM_AM::da: return ARM::LDMDA_UPD;
+    case ARM_AM::db: return ARM::LDMDB_UPD;
+    }
+    break;
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA_UPD;
+    case ARM_AM::ib: return ARM::STMIB_UPD;
+    case ARM_AM::da: return ARM::STMDA_UPD;
+    case ARM_AM::db: return ARM::STMDB_UPD;
+    }
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
+    case ARM_AM::db: return ARM::t2LDMDB_UPD;
+    }
+    break;
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA_UPD;
+    case ARM_AM::db: return ARM::t2STMDB_UPD;
+    }
+    break;
+  case ARM::VLDMSIA:
+  case ARM::VLDMSDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
+    case ARM_AM::db: return ARM::VLDMSDB_UPD;
+    }
+    break;
+  case ARM::VLDMDIA:
+  case ARM::VLDMDDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
+    case ARM_AM::db: return ARM::VLDMDDB_UPD;
+    }
+    break;
+  case ARM::VSTMSIA:
+  case ARM::VSTMSDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
+    case ARM_AM::db: return ARM::VSTMSDB_UPD;
+    }
+    break;
+  case ARM::VSTMDIA:
+  case ARM::VSTMDDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
+    case ARM_AM::db: return ARM::VSTMDDB_UPD;
+    }
+    break;
   }
+
   return 0;
 }
 
@@ -505,16 +685,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
   int Opcode = MI->getOpcode();
   DebugLoc dl = MI->getDebugLoc();
 
-  bool DoMerge = false;
-  ARM_AM::AMSubMode Mode = ARM_AM::ia;
-
   // Can't use an updating ld/st if the base register is also a dest
   // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
-  for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
+  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
     if (MI->getOperand(i).getReg() == Base)
       return false;
-  }
-  Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
+
+  bool DoMerge = false;
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);
 
   // Try merging with the previous instruction.
   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
@@ -560,15 +738,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
   if (!DoMerge)
     return false;
 
-  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode);
+  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
     .addReg(Base, getDefRegState(true)) // WB base register
     .addReg(Base, getKillRegState(BaseKill))
-    .addImm(ARM_AM::getAM4ModeImm(Mode))
     .addImm(Pred).addReg(PredReg);
+
   // Transfer the rest of operands.
-  for (unsigned OpNum = 4, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
     MIB.addOperand(MI->getOperand(OpNum));
+
   // Transfer memoperands.
   (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
@@ -576,14 +755,21 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
   return true;
 }
 
-static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
+                                             ARM_AM::AddrOpc Mode) {
   switch (Opc) {
-  case ARM::LDR: return ARM::LDR_PRE;
-  case ARM::STR: return ARM::STR_PRE;
-  case ARM::VLDRS: return ARM::VLDMS_UPD;
-  case ARM::VLDRD: return ARM::VLDMD_UPD;
-  case ARM::VSTRS: return ARM::VSTMS_UPD;
-  case ARM::VSTRD: return ARM::VSTMD_UPD;
+  case ARM::LDRi12:
+    return ARM::LDR_PRE;
+  case ARM::STRi12:
+    return ARM::STR_PRE;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
     return ARM::t2LDR_PRE;
@@ -595,14 +781,21 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
   return 0;
 }
 
-static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
+                                              ARM_AM::AddrOpc Mode) {
   switch (Opc) {
-  case ARM::LDR: return ARM::LDR_POST;
-  case ARM::STR: return ARM::STR_POST;
-  case ARM::VLDRS: return ARM::VLDMS_UPD;
-  case ARM::VLDRD: return ARM::VLDMD_UPD;
-  case ARM::VSTRS: return ARM::VSTMS_UPD;
-  case ARM::VSTRD: return ARM::VSTMD_UPD;
+  case ARM::LDRi12:
+    return ARM::LDR_POST;
+  case ARM::STRi12:
+    return ARM::STR_POST;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
     return ARM::t2LDR_POST;
@@ -629,14 +822,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   DebugLoc dl = MI->getDebugLoc();
   bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
-  bool isAM2 = (Opcode == ARM::LDR || Opcode == ARM::STR);
-  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
-    return false;
-  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
-    return false;
-  if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
+  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+  if (isi32Load(Opcode) || isi32Store(Opcode))
     if (MI->getOperand(2).getImm() != 0)
       return false;
+  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
 
   bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
   // Can't do the merge if the destination register is the same as the would-be
@@ -666,7 +857,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
       DoMerge = true;
     }
     if (DoMerge) {
-      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
       MBB.erase(PrevMBBI);
     }
   }
@@ -685,7 +876,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
       DoMerge = true;
     }
     if (DoMerge) {
-      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
       if (NextMBBI == I) {
         Advance = true;
         ++I;
@@ -698,12 +889,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     return false;
 
   unsigned Offset = 0;
-  if (isAM5)
-    Offset = ARM_AM::getAM4ModeImm(AddSub == ARM_AM::sub ?
-                                   ARM_AM::db : ARM_AM::ia);
-  else if (isAM2)
+  if (isAM2)
     Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
-  else
+  else if (!isAM5)
     Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
 
   if (isAM5) {
@@ -715,7 +903,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
       .addReg(Base, getDefRegState(true)) // WB base register
       .addReg(Base, getKillRegState(isLd ? BaseKill : false))
-      .addImm(Offset)
       .addImm(Pred).addReg(PredReg)
       .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                             getKillRegState(MO.isKill())));
@@ -782,15 +969,14 @@ static bool isMemoryOp(const MachineInstr *MI) {
   int Opcode = MI->getOpcode();
   switch (Opcode) {
   default: break;
-  case ARM::LDR:
-  case ARM::STR:
-    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
   case ARM::VLDRS:
   case ARM::VSTRS:
     return MI->getOperand(1).isReg();
   case ARM::VLDRD:
   case ARM::VSTRD:
     return MI->getOperand(1).isReg();
+  case ARM::LDRi12:
+  case ARM::STRi12:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -818,24 +1004,19 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
 
 static int getMemoryOpOffset(const MachineInstr *MI) {
   int Opcode = MI->getOpcode();
-  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
   bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
   unsigned NumOperands = MI->getDesc().getNumOperands();
   unsigned OffField = MI->getOperand(NumOperands-3).getImm();
 
   if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
       Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
-      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
+      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
     return OffField;
 
-  int Offset = isAM2
-    ? ARM_AM::getAM2Offset(OffField)
-    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
-             : ARM_AM::getAM5Offset(OffField) * 4);
-  if (isAM2) {
-    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
-      Offset = -Offset;
-  } else if (isAM3) {
+  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
+    : ARM_AM::getAM5Offset(OffField) * 4;
+  if (isAM3) {
     if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
       Offset = -Offset;
   } else {
@@ -847,35 +1028,24 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
 
 static void InsertLDR_STR(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI,
-                          int OffImm, bool isDef,
+                          int Offset, bool isDef,
                           DebugLoc dl, unsigned NewOpc,
                           unsigned Reg, bool RegDeadKill, bool RegUndef,
                           unsigned BaseReg, bool BaseKill, bool BaseUndef,
-                          unsigned OffReg, bool OffKill, bool OffUndef,
+                          bool OffKill, bool OffUndef,
                           ARMCC::CondCodes Pred, unsigned PredReg,
                           const TargetInstrInfo *TII, bool isT2) {
-  int Offset = OffImm;
-  if (!isT2) {
-    if (OffImm < 0)
-      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
-    else
-      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
-  }
   if (isDef) {
     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                       TII->get(NewOpc))
       .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
-    if (!isT2)
-      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
   } else {
     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                       TII->get(NewOpc))
       .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
       .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
-    if (!isT2)
-      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
     MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
   }
 }
@@ -906,23 +1076,21 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
     unsigned BaseReg = BaseOp.getReg();
     bool BaseKill = BaseOp.isKill();
     bool BaseUndef = BaseOp.isUndef();
-    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
     bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
     bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
     int OffImm = getMemoryOpOffset(MI);
     unsigned PredReg = 0;
     ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
 
-    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
+    if (OddRegNum > EvenRegNum && OffImm == 0) {
       // Ascending register numbers and no offset. It's safe to change it to a
       // ldm or stm.
       unsigned NewOpc = (isLd)
-        ? (isT2 ? ARM::t2LDM : ARM::LDM)
-        : (isT2 ? ARM::t2STM : ARM::STM);
+        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
+        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
       if (isLd) {
         BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
           .addReg(BaseReg, getKillRegState(BaseKill))
-          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
           .addImm(Pred).addReg(PredReg)
           .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
           .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
@@ -930,7 +1098,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
       } else {
         BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
           .addReg(BaseReg, getKillRegState(BaseKill))
-          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
           .addImm(Pred).addReg(PredReg)
           .addReg(EvenReg,
                   getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
@@ -941,28 +1108,24 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
       NewBBI = llvm::prior(MBBI);
     } else {
       // Split into two instructions.
-      assert((!isT2 || !OffReg) &&
-             "Thumb2 ldrd / strd does not encode offset register!");
       unsigned NewOpc = (isLd)
-        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
-        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
+        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
       DebugLoc dl = MBBI->getDebugLoc();
       // If this is a load and base register is killed, it may have been
       // re-defed by the load, make sure the first load does not clobber it.
       if (isLd &&
           (BaseKill || OffKill) &&
-          (TRI->regsOverlap(EvenReg, BaseReg) ||
-           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
-        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
-               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
+          (TRI->regsOverlap(EvenReg, BaseReg))) {
+        assert(!TRI->regsOverlap(OddReg, BaseReg));
         InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                       OddReg, OddDeadKill, false,
-                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+                      BaseReg, false, BaseUndef, false, OffUndef,
                       Pred, PredReg, TII, isT2);
         NewBBI = llvm::prior(MBBI);
         InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                       EvenReg, EvenDeadKill, false,
-                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                       Pred, PredReg, TII, isT2);
       } else {
         if (OddReg == EvenReg && EvenDeadKill) {
@@ -974,12 +1137,12 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
         }
         InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                       EvenReg, EvenDeadKill, EvenUndef,
-                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
+                      BaseReg, false, BaseUndef, false, OffUndef,
                       Pred, PredReg, TII, isT2);
         NewBBI = llvm::prior(MBBI);
         InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                       OddReg, OddDeadKill, OddUndef,
-                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                       Pred, PredReg, TII, isT2);
       }
       if (isLd)
@@ -1158,17 +1321,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
   return NumMerges > 0;
 }
 
-namespace {
-  struct OffsetCompare {
-    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
-      int LOffset = getMemoryOpOffset(LHS);
-      int ROffset = getMemoryOpOffset(RHS);
-      assert(LHS == RHS || LOffset != ROffset);
-      return LOffset > ROffset;
-    }
-  };
-}
-
 /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops
 /// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it
 /// directly restore the value of LR into pc.
@@ -1182,20 +1334,25 @@ namespace {
 bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
   if (MBB.empty()) return false;
 
-  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   if (MBBI != MBB.begin() &&
       (MBBI->getOpcode() == ARM::BX_RET ||
        MBBI->getOpcode() == ARM::tBX_RET ||
        MBBI->getOpcode() == ARM::MOVPCLR)) {
     MachineInstr *PrevMI = prior(MBBI);
-    if (PrevMI->getOpcode() == ARM::LDM_UPD ||
-        PrevMI->getOpcode() == ARM::t2LDM_UPD) {
+    unsigned Opcode = PrevMI->getOpcode();
+    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
       MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
       if (MO.getReg() != ARM::LR)
         return false;
-      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
+      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
       PrevMI->setDesc(TII->get(NewOpc));
       MO.setReg(ARM::PC);
+      PrevMI->copyImplicitOps(&*MBBI);
       MBB.erase(MBBI);
       return true;
     }
@@ -1216,7 +1373,8 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
        ++MFI) {
     MachineBasicBlock &MBB = *MFI;
     Modified |= LoadStoreMultipleOpti(MBB);
-    Modified |= MergeReturnIntoLDM(MBB);
+    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
+      Modified |= MergeReturnIntoLDM(MBB);
   }
 
   delete RS;
@@ -1250,7 +1408,7 @@ namespace {
     bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                           unsigned &NewOpc, unsigned &EvenReg,
                           unsigned &OddReg, unsigned &BaseReg,
-                          unsigned &OffReg, int &Offset,
+                          int &Offset,
                           unsigned &PredReg, ARMCC::CondCodes &Pred,
                           bool &isT2);
     bool RescheduleOps(MachineBasicBlock *MBB,
@@ -1292,7 +1450,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
     if (I->isDebugValue() || MemOps.count(&*I))
       continue;
     const TargetInstrDesc &TID = I->getDesc();
-    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
+    if (TID.isCall() || TID.isTerminator() || I->hasUnmodeledSideEffects())
       return false;
     if (isLd && TID.mayStore())
       return false;
@@ -1330,8 +1488,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                           DebugLoc &dl,
                                           unsigned &NewOpc, unsigned &EvenReg,
                                           unsigned &OddReg, unsigned &BaseReg,
-                                          unsigned &OffReg, int &Offset,
-                                          unsigned &PredReg,
+                                          int &Offset, unsigned &PredReg,
                                           ARMCC::CondCodes &Pred,
                                           bool &isT2) {
   // Make sure we're allowed to generate LDRD/STRD.
@@ -1341,9 +1498,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
   // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
   unsigned Scale = 1;
   unsigned Opcode = Op0->getOpcode();
-  if (Opcode == ARM::LDR)
+  if (Opcode == ARM::LDRi12)
     NewOpc = ARM::LDRD;
-  else if (Opcode == ARM::STR)
+  else if (Opcode == ARM::STRi12)
     NewOpc = ARM::STRD;
   else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
     NewOpc = ARM::t2LDRDi8;
@@ -1356,12 +1513,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
   } else
     return false;
 
-  // Make sure the offset registers match.
-  if (!isT2 &&
-      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
-      return false;
-
-  // Must sure the base address satisfies i64 ld / st alignment requirement.
+  // Make sure the base address satisfies i64 ld / st alignment requirement.
   if (!Op0->hasOneMemOperand() ||
       !(*Op0->memoperands_begin())->getValue() ||
       (*Op0->memoperands_begin())->isVolatile())
@@ -1370,7 +1522,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
   unsigned Align = (*Op0->memoperands_begin())->getAlignment();
   const Function *Func = MF->getFunction();
   unsigned ReqAlign = STI->hasV6Ops()
-    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext())) 
+    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
     : 8;  // Pre-v6 need 8-byte align
   if (Align < ReqAlign)
     return false;
@@ -1404,13 +1556,22 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
   if (EvenReg == OddReg)
     return false;
   BaseReg = Op0->getOperand(1).getReg();
-  if (!isT2)
-    OffReg = Op0->getOperand(2).getReg();
   Pred = llvm::getInstrPredicate(Op0, PredReg);
   dl = Op0->getDebugLoc();
   return true;
 }
 
+namespace {
+  struct OffsetCompare {
+    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
+      int LOffset = getMemoryOpOffset(LHS);
+      int ROffset = getMemoryOpOffset(RHS);
+      assert(LHS == RHS || LOffset != ROffset);
+      return LOffset > ROffset;
+    }
+  };
+}
+
 bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                  SmallVector<MachineInstr*, 4> &Ops,
                                  unsigned Base, bool isLd,
@@ -1493,14 +1654,14 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
         MachineInstr *Op0 = Ops.back();
         MachineInstr *Op1 = Ops[Ops.size()-2];
         unsigned EvenReg = 0, OddReg = 0;
-        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
         ARMCC::CondCodes Pred = ARMCC::AL;
         bool isT2 = false;
         unsigned NewOpc = 0;
         int Offset = 0;
         DebugLoc dl;
         if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
-                                             EvenReg, OddReg, BaseReg, OffReg,
+                                             EvenReg, OddReg, BaseReg,
                                              Offset, PredReg, Pred, isT2)) {
           Ops.pop_back();
           Ops.pop_back();
@@ -1512,8 +1673,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
               .addReg(EvenReg, RegState::Define)
               .addReg(OddReg, RegState::Define)
               .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming LDRi12s.
             if (!isT2)
-              MIB.addReg(OffReg);
+              MIB.addReg(0);
             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
             ++NumLDRDFormed;
           } else {
@@ -1522,8 +1686,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
               .addReg(EvenReg)
               .addReg(OddReg)
               .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming STRi12s.
             if (!isT2)
-              MIB.addReg(OffReg);
+              MIB.addReg(0);
             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
             ++NumSTRDFormed;
           }
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp
new file mode 100644
index 0000000..6d7b485
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1230 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMFixupKinds.h"
+#include "ARMInstrInfo.h"
+#include "ARMMCExpr.h"
+#include "ARMSubtarget.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const ARMSubtarget *Subtarget;
+  MCContext &Ctx;
+
+public:
+  ARMMCCodeEmitter(TargetMachine &tm, MCContext &ctx)
+    : TM(tm), TII(*TM.getInstrInfo()),
+      Subtarget(&TM.getSubtarget<ARMSubtarget>()), Ctx(ctx) {
+  }
+
+  ~ARMMCCodeEmitter() {}
+
+  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  unsigned getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+  /// the specified operand. This is used for operands with :lower16: and
+  /// :upper16: prefixes.
+  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+                              unsigned &Reg, unsigned &Imm,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+  /// BL branch target.
+  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+  /// BLX branch target.
+  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+  /// immediate Thumb2 direct branch target.
+  uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+  
+  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+  /// ADR label target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups)const;
+
+  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+  /// operand as needed by load/store instructions.
+  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
+  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::da: return 0;
+    case ARM_AM::ia: return 1;
+    case ARM_AM::db: return 2;
+    case ARM_AM::ib: return 3;
+    }
+  }
+  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+  ///
+  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+    switch (ShOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::no_shift:
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::asr: return 2;
+    case ARM_AM::ror:
+    case ARM_AM::rrx: return 3;
+    }
+    return 0;
+  }
+
+  /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
+  uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
+    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
+    return Encoded;
+  }
+
+  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getSORegOpValue - Return an encoded so_reg shifted register value.
+  unsigned getSORegOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getRotImmOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups) const {
+    switch (MI.getOperand(Op).getImm()) {
+    default: assert (0 && "Not a valid rot_imm value!");
+    case 0:  return 0;
+    case 8:  return 1;
+    case 16: return 2;
+    case 24: return 3;
+    }
+  }
+
+  unsigned getImmMinusOneOpValue(const MCInst &MI, unsigned Op,
+                                 SmallVectorImpl<MCFixup> &Fixups) const {
+    return MI.getOperand(Op).getImm() - 1;
+  }
+
+  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                   SmallVectorImpl<MCFixup> &Fixups) const {
+    return 64 - MI.getOperand(Op).getImm();
+  }
+
+  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getMsbOpValue(const MCInst &MI, unsigned Op,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                      unsigned EncodedValue) const;
+  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                          unsigned EncodedValue) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMMCCodeEmitter(const Target &, TargetMachine &TM,
+                                            MCContext &Ctx) {
+  return new ARMMCCodeEmitter(TM, Ctx);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (Subtarget->isThumb2()) {
+    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (Subtarget->isThumb2()) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (Subtarget->isThumb2()) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
+  if (Subtarget->isThumb2()) {
+    EncodedValue &= 0x0FFFFFFF;
+    EncodedValue |= 0xE0000000;
+  }
+  return EncodedValue;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = getARMRegisterNumbering(Reg);
+
+    // Q registers are encoded as 2x their register number.
+    switch (Reg) {
+    default:
+      return RegNo;
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * RegNo;
+    }
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+  return 0;
+}
+
+/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+
+  Reg = getARMRegisterNumbering(MO.getReg());
+
+  int32_t SImm = MO1.getImm();
+  bool isAdd = true;
+
+  // Special value for #-0
+  if (SImm == INT32_MIN)
+    SImm = 0;
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (SImm < 0) {
+    SImm = -SImm;
+    isAdd = false;
+  }
+
+  Imm = SImm;
+  return isAdd;
+}
+
+/// getBranchTargetOpValue - Helper function to get the branch target operand,
+/// which is either an immediate or requires a fixup.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups) {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, Fixups);
+}
+
+/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+/// BLX branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, Fixups);
+}
+
+/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, Fixups);
+}
+
+/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, Fixups);
+}
+
+/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+}
+
+/// Return true if this branch has a non-always predication
+static bool HasConditionalBranch(const MCInst &MI) {
+  int NumOp = MI.getNumOperands();
+  if (NumOp >= 2) {
+    for (int i = 0; i < NumOp-1; ++i) {
+      const MCOperand &MCOp1 = MI.getOperand(i);
+      const MCOperand &MCOp2 = MI.getOperand(i + 1);
+      if (MCOp1.isImm() && MCOp2.isReg() && 
+          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
+        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL) 
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (Subtarget->isThumb2())
+    return
+      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups);
+  return getARMBranchTargetOpValue(MI, OpIdx, Fixups);
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  if (HasConditionalBranch(MI)) 
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_condbranch, Fixups);
+  return ::getBranchTargetOpValue(MI, OpIdx, 
+                                  ARM::fixup_arm_uncondbranch, Fixups);
+}
+
+
+
+
+/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+/// immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Val =
+    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  bool I  = (Val & 0x800000);
+  bool J1 = (Val & 0x400000);
+  bool J2 = (Val & 0x200000);
+  if (I ^ J1)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (I ^ J2)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+                                  Fixups);
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                  Fixups);
+}
+
+/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                  Fixups);
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &) const {
+  // [Rn, Rm]
+  //   {5-3} = Rm
+  //   {2-0} = Rn
+  const MCOperand &MO1 = MI.getOperand(OpIdx);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = getARMRegisterNumbering(MO1.getReg());
+  unsigned Rm = getARMRegisterNumbering(MO2.getReg());
+  return (Rm << 3) | Rn;
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-13} = reg
+  // {12}    = (U)nsigned (add == '1', sub == '0')
+  // {11-0}  = imm12
+  unsigned Reg, Imm12;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm12 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+
+    MCFixupKind Kind;
+    if (Subtarget->isThumb2())
+      Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
+
+  uint32_t Binary = Imm12 & 0xfff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 12);
+  Binary |= (Reg << 13);
+  return Binary;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+
+  uint32_t Binary = (Imm8 >> 2) & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+// FIXME: This routine assumes that a binary
+// expression will always result in a PCRel expression
+// In reality, its only true if one or more subexpressions
+// is itself a PCRel (i.e. "." in asm or some other pcrel construct)
+// but this is good enough for now.
+static bool EvaluateAsPCRel(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  default: assert(0 && "Unexpected expression type");
+  case MCExpr::SymbolRef: return false;
+  case MCExpr::Binary: return true;
+  }
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // {20-16} = imm{15-12}
+  // {11-0}  = imm{11-0}
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    // Hi / lo 16 bits already extracted during earlier passes.
+    return static_cast<unsigned>(MO.getImm());
+
+  // Handle :upper16: and :lower16: assembly prefixes.
+  const MCExpr *E = MO.getExpr();
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    MCFixupKind Kind;
+    switch (ARM16Expr->getKind()) {
+    default: assert(0 && "Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      if (!Subtarget->isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(Subtarget->isThumb2()
+                           ? ARM::fixup_t2_movt_hi16_pcrel
+                           : ARM::fixup_arm_movt_hi16_pcrel);
+      else
+        Kind = MCFixupKind(Subtarget->isThumb2()
+                           ? ARM::fixup_t2_movt_hi16
+                           : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      if (!Subtarget->isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(Subtarget->isThumb2()
+                           ? ARM::fixup_t2_movw_lo16_pcrel
+                           : ARM::fixup_arm_movw_lo16_pcrel);
+      else
+        Kind = MCFixupKind(Subtarget->isThumb2()
+                           ? ARM::fixup_t2_movw_lo16
+                           : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+    Fixups.push_back(MCFixup::Create(0, E, Kind));
+    return 0;
+  };
+
+  llvm_unreachable("Unsupported MCExpr type in MCOperand!");
+  return 0;
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Rm = getARMRegisterNumbering(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // {16-13} = Rn
+  // {12}    = isAdd
+  // {11-0}  = shifter
+  //  {3-0}  = Rm
+  //  {4}    = 0
+  //  {6-5}  = type
+  //  {11-7} = imm
+  uint32_t Binary = Rm;
+  Binary |= Rn << 13;
+  Binary |= SBits << 5;
+  Binary |= ShImm << 7;
+  if (isAdd)
+    Binary |= 1 << 12;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-14}  Rn
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
+  Binary |= Rn << 14;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
+  bool isReg = MO.getReg() != 0;
+  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
+  if (isReg) {
+    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
+    Binary <<= 7;                    // Shift amount is bits [11:7]
+    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
+    Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0]
+  }
+  return Binary | (isAdd << 12) | (isReg << 13);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {9}      1 == imm8, 0 == Rm
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = getARMRegisterNumbering(MO.getReg());
+  return Imm8 | (isAdd << 8) | (isImm << 9);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Imm = MO2.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO1.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = getARMRegisterNumbering(MO1.getReg());
+  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // [SP, #imm]
+  //   {7-0} = imm8
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate is already shifted for the implicit zeroes, so no change
+  // here.
+  return MO1.getImm() & 0xff;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  // [Rn, #imm]
+  //   {7-3} = imm5
+  //   {2-0} = Rn
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = getARMRegisterNumbering(MO.getReg());
+  unsigned Imm5 = MO1.getImm();
+  return ((Imm5 & 0x1f) << 3) | Rn;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = getARMRegisterNumbering(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (Subtarget->isThumb2())
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is either Rs, the amount to shift by, or reg0 in which
+  // case the imm contains the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 1 if reg shift, 0 if imm shift
+  // {6-5} = type
+  //    If reg shift:
+  //      {11-8} = Rs
+  //      {7}    = 0
+  //    else (imm shift)
+  //      {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode Rm.
+  unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    // RRX - 0110 and bit[11:8] clear.
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    case ARM_AM::rrx: SBits = 0x6; break;
+    }
+  } else {
+    // Set shift operand (bit[6:4]).
+    // LSL - 000
+    // LSR - 010
+    // ASR - 100
+    // ROR - 110
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x0; break;
+    case ARM_AM::lsr: SBits = 0x2; break;
+    case ARM_AM::asr: SBits = 0x4; break;
+    case ARM_AM::ror: SBits = 0x6; break;
+    }
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode the shift operation Rs or shift_imm (except rrx).
+  if (Rs) {
+    // Encode Rs bit[11:8].
+    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+    return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+  }
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+  const MCOperand &MO3 = MI.getOperand(OpNum+2);
+
+  // Encoded as [Rn, Rm, imm].
+  // FIXME: Needs fixup support.
+  unsigned Value = getARMRegisterNumbering(MO1.getReg());
+  Value <<= 4;
+  Value |= getARMRegisterNumbering(MO2.getReg());
+  Value <<= 2;
+  Value |= MO3.getImm();
+
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = getARMRegisterNumbering(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 4096; // Set the ADD bit
+  Value |= tmp & 4095;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::ror: SBits = 0x6; break;
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+  // msb of the mask.
+  const MCOperand &MO = MI.getOperand(Op);
+  uint32_t v = ~MO.getImm();
+  uint32_t lsb = CountTrailingZeros_32(v);
+  uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
+  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+  return lsb | (msb << 5);
+}
+
+unsigned ARMMCCodeEmitter::
+getMsbOpValue(const MCInst &MI, unsigned Op,
+              SmallVectorImpl<MCFixup> &Fixups) const {
+  // MSB - 5 bits.
+  uint32_t lsb = MI.getOperand(Op-1).getImm();
+  uint32_t width = MI.getOperand(Op).getImm();
+  uint32_t msb = lsb+width-1;
+  assert (width != 0 && msb < 32 && "Illegal bit width!");
+  return msb;
+}
+
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // VLDM/VSTM:
+  //   {12-8} = Vd
+  //   {7-0}  = Number of registers
+  //
+  // LDM/STM:
+  //   {15-0}  = Bitfield of GPRs.
+  unsigned Reg = MI.getOperand(Op).getReg();
+  bool SPRRegs = ARM::SPRRegClass.contains(Reg);
+  bool DPRRegs = ARM::DPRRegClass.contains(Reg);
+
+  unsigned Binary = 0;
+
+  if (SPRRegs || DPRRegs) {
+    // VLDM/VSTM
+    unsigned RegNo = getARMRegisterNumbering(Reg);
+    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+    Binary |= (RegNo & 0x1f) << 8;
+    if (SPRRegs)
+      Binary |= NumRegs;
+    else
+      Binary |= NumRegs * 2;
+  } else {
+    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+      unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg());
+      Binary |= 1 << RegNo;
+    }
+  }
+
+  return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
+/// with the alignment operand.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x02; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
+/// alignment operand for use in VLD-dup instructions.  This is the same as
+/// getAddrMode6AddressOpValue except for the alignment encoding, which is
+/// different for VLD4-dup.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.getReg() == 0) return 0x0D;
+  return MO.getReg();
+}
+
+void ARMMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  // Pseudo instructions don't get encoded.
+  const TargetInstrDesc &Desc = TII.get(MI.getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+  int Size;
+  // Basic size info comes from the TSFlags field.
+  switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+  default: llvm_unreachable("Unexpected instruction size!");
+  case ARMII::Size2Bytes: Size = 2; break;
+  case ARMII::Size4Bytes: Size = 4; break;
+  }
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+  // Thumb 32-bit wide instructions need to emit the high order halfword
+  // first.
+  if (Subtarget->isThumb() && Size == 4) {
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else
+    EmitConstant(Binary, Size, OS);
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp
new file mode 100644
index 0000000..2727ba8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCExpr.cpp
@@ -0,0 +1,73 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "armmcexpr"
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAssembler.h"
+using namespace llvm;
+
+const ARMMCExpr*
+ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                       MCContext &Ctx) {
+  return new (Ctx) ARMMCExpr(Kind, Expr);
+}
+
+void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: assert(0 && "Invalid kind!");
+  case VK_ARM_HI16: OS << ":upper16:"; break;
+  case VK_ARM_LO16: OS << ":lower16:"; break;
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+bool
+ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  return false;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    assert(0 && "Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbols_(BE->getLHS(), Asm);
+    AddValueSymbols_(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbols_(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/ARMMCExpr.h
new file mode 100644
index 0000000..d42f766
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCExpr.h
@@ -0,0 +1,73 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCEXPR_H
+#define ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class ARMMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_ARM_None,
+    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+  
+public:
+  /// @name Construction
+  /// @{
+
+  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                      MCContext &Ctx);
+
+  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_HI16, Expr, Ctx);
+  }
+
+  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_LO16, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+  void AddValueSymbols(MCAssembler *) const;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+  
+  static bool classof(const ARMMCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index ab2b06b..59d6050 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -12,122 +12,69 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARMMCInstLower.h"
-//#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/CodeGen/AsmPrinter.h"
+#include "ARM.h"
+#include "ARMAsmPrinter.h"
+#include "ARMMCExpr.h"
+#include "llvm/Constants.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-//#include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallString.h"
 using namespace llvm;
 
 
-#if 0
-const ARMSubtarget &ARMMCInstLower::getSubtarget() const {
-  return AsmPrinter.getSubtarget();
-}
-
-MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const {
-  assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
-  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); 
-}
-#endif
-
-MCSymbol *ARMMCInstLower::
-GetGlobalAddressSymbol(const MachineOperand &MO) const {
-  // FIXME: HANDLE PLT references how??
-  switch (MO.getTargetFlags()) {
-  default: assert(0 && "Unknown target flag on GV operand");
-  case 0: break;
-  }
-  
-  return Printer.Mang->getSymbol(MO.getGlobal());
-}
-
-MCSymbol *ARMMCInstLower::
-GetExternalSymbolSymbol(const MachineOperand &MO) const {
-  // FIXME: HANDLE PLT references how??
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              ARMAsmPrinter &Printer) {
+  MCContext &Ctx = Printer.OutContext;
+  const MCExpr *Expr;
   switch (MO.getTargetFlags()) {
-  default: assert(0 && "Unknown target flag on GV operand");
-  case 0: break;
+  default: {
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx);
+    switch (MO.getTargetFlags()) {
+    default:
+      assert(0 && "Unknown target flag on symbol operand");
+    case 0:
+      break;
+    case ARMII::MO_LO16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx);
+      Expr = ARMMCExpr::CreateLower16(Expr, Ctx);
+      break;
+    case ARMII::MO_HI16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, Ctx);
+      Expr = ARMMCExpr::CreateUpper16(Expr, Ctx);
+      break;
+    }
+    break;
   }
-  
-  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
-}
-
 
-
-MCSymbol *ARMMCInstLower::
-GetJumpTableSymbol(const MachineOperand &MO) const {
-  SmallString<256> Name;
-  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
-    << Printer.getFunctionNumber() << '_' << MO.getIndex();
-  
-#if 0
-  switch (MO.getTargetFlags()) {
-    default: llvm_unreachable("Unknown target flag on GV operand");
+  case ARMII::MO_PLT:
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_ARM_PLT, Ctx);
+    break;
   }
-#endif
-  
-  // Create a symbol for the name.
-  return Ctx.GetOrCreateSymbol(Name.str());
-}
 
-MCSymbol *ARMMCInstLower::
-GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
-  SmallString<256> Name;
-  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
-    << Printer.getFunctionNumber() << '_' << MO.getIndex();
-  
-#if 0
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  }
-#endif
-  
-  // Create a symbol for the name.
-  return Ctx.GetOrCreateSymbol(Name.str());
-}
-  
-MCOperand ARMMCInstLower::
-LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
-  // FIXME: We would like an efficient form for this, so we don't have to do a
-  // lot of extra uniquing.
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
-  
-#if 0
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  }
-#endif
-  
   if (!MO.isJTI() && MO.getOffset())
     Expr = MCBinaryExpr::CreateAdd(Expr,
                                    MCConstantExpr::Create(MO.getOffset(), Ctx),
                                    Ctx);
   return MCOperand::CreateExpr(Expr);
-}
 
+}
 
-void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        ARMAsmPrinter &AP) {
   OutMI.setOpcode(MI->getOpcode());
-  
+
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    
+
     MCOperand MCOp;
     switch (MO.getType()) {
     default:
       MI->dump();
       assert(0 && "unknown operand type");
     case MachineOperand::MO_Register:
-      // Ignore all implicit register operands.
-      if (MO.isImplicit()) continue;
+      // Ignore all non-CPSR implicit register operands.
+      if (MO.isImplicit() && MO.getReg() != ARM::CPSR) continue;
       assert(!MO.getSubReg() && "Subregs should be eliminated!");
       MCOp = MCOperand::CreateReg(MO.getReg());
       break;
@@ -136,27 +83,33 @@ void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       break;
     case MachineOperand::MO_MachineBasicBlock:
       MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
-                       MO.getMBB()->getSymbol(), Ctx));
+                       MO.getMBB()->getSymbol(), AP.OutContext));
       break;
     case MachineOperand::MO_GlobalAddress:
-      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      MCOp = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP);
       break;
     case MachineOperand::MO_ExternalSymbol:
-      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      MCOp = GetSymbolRef(MO,
+                          AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
       break;
     case MachineOperand::MO_JumpTableIndex:
-      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
       break;
     case MachineOperand::MO_ConstantPoolIndex:
-      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
       break;
     case MachineOperand::MO_BlockAddress:
-      MCOp = LowerSymbolOperand(MO, Printer.GetBlockAddressSymbol(
-                                              MO.getBlockAddress()));
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
       break;
+    case MachineOperand::MO_FPImmediate: {
+      APFloat Val = MO.getFPImm()->getValueAPF();
+      bool ignored;
+      Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+      MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
+      break;
+    }
     }
-    
+
     OutMI.addOperand(MCOp);
   }
-  
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h
deleted file mode 100644
index b81a306..0000000
--- a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_MCINSTLOWER_H
-#define ARM_MCINSTLOWER_H
-
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-  class AsmPrinter;
-  class MCAsmInfo;
-  class MCContext;
-  class MCInst;
-  class MCOperand;
-  class MCSymbol;
-  class MachineInstr;
-  class MachineModuleInfoMachO;
-  class MachineOperand;
-  class Mangler;
-  //class ARMSubtarget;
-  
-/// ARMMCInstLower - This class is used to lower an MachineInstr into an MCInst.
-class LLVM_LIBRARY_VISIBILITY ARMMCInstLower {
-  MCContext &Ctx;
-  Mangler &Mang;
-  AsmPrinter &Printer;
-
-  //const ARMSubtarget &getSubtarget() const;
-public:
-  ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
-    : Ctx(ctx), Mang(mang), Printer(printer) {}
-  
-  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-  //MCSymbol *GetPICBaseSymbol() const;
-  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
-  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
-  
-/*
-private:
-  MachineModuleInfoMachO &getMachOMMI() const;
- */
-};
-
-}
-
-#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 514c26b..138f0c2 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -22,8 +22,8 @@
 
 namespace llvm {
 
-/// ARMFunctionInfo - This class is derived from MachineFunction private
-/// ARM target-specific information for each MachineFunction.
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM-specific information for each MachineFunction.
 class ARMFunctionInfo : public MachineFunctionInfo {
 
   /// isThumb - True if this function is compiled under Thumb mode.
@@ -79,15 +79,11 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   BitVector GPRCS2Frames;
   BitVector DPRCSFrames;
 
-  /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers.
-  ///
-  BitVector SpilledCSRegs;
-
   /// JumpTableUId - Unique id for jumptables.
   ///
   unsigned JumpTableUId;
 
-  unsigned ConstPoolEntryUId;
+  unsigned PICLabelUId;
 
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
@@ -95,6 +91,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// HasITBlocks - True if IT blocks have been inserted.
   bool HasITBlocks;
 
+  /// CPEClones - Track constant pool entries clones created by Constant Island
+  /// pass.
+  DenseMap<unsigned, unsigned> CPEClones;
+
 public:
   ARMFunctionInfo() :
     isThumb(false),
@@ -104,8 +104,8 @@ public:
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
     GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
-    JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0),
-    HasITBlocks(false) {}
+    JumpTableUId(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false) {}
 
   explicit ARMFunctionInfo(MachineFunction &MF) :
     isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
@@ -115,9 +115,8 @@ public:
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
     GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
-    SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()),
-    JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0),
-    HasITBlocks(false) {}
+    JumpTableUId(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false) {}
 
   bool isThumbFunction() const { return isThumb; }
   bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
@@ -207,18 +206,6 @@ public:
     }
   }
 
-  void setCSRegisterIsSpilled(unsigned Reg) {
-    SpilledCSRegs.set(Reg);
-  }
-
-  bool isCSRegisterSpilled(unsigned Reg) const {
-    return SpilledCSRegs[Reg];
-  }
-
-  const BitVector &getSpilledCSRegisters() const {
-    return SpilledCSRegs;
-  }
-
   unsigned createJumpTableUId() {
     return JumpTableUId++;
   }
@@ -227,16 +214,16 @@ public:
     return JumpTableUId;
   }
 
-  void initConstPoolEntryUId(unsigned UId) {
-    ConstPoolEntryUId = UId;
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
   }
 
-  unsigned getNumConstPoolEntries() const {
-    return ConstPoolEntryUId;
+  unsigned getNumPICLabels() const {
+    return PICLabelUId;
   }
 
-  unsigned createConstPoolEntryUId() {
-    return ConstPoolEntryUId++;
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
   }
 
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
@@ -244,6 +231,19 @@ public:
 
   bool hasITBlocks() const { return HasITBlocks; }
   void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+  void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+    if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+      assert(0 && "Duplicate entries!");
+  }
+
+  unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+    DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+    if (I != CPEClones.end())
+      return I->second;
+    else
+      return -1U;
+  }
 };
 } // End llvm namespace
 
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
index 5ff7c38..edecc4b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -21,6566 +21,6566 @@
 
 // This table is 6561*4 = 26244 bytes in size.
 static const unsigned PerfectShuffleTable[6561+1] = {
-  135053414U,	// <0,0,0,0>: Cost 1 vdup0 LHS
-  1543503974U,	// <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
-  2618572962U,	// <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
-  2568054923U,	// <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1476398390U,	// <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
-  2550140624U,	// <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
-  2550141434U,	// <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
-  2591945711U,	// <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-  135053414U,	// <0,0,0,u>: Cost 1 vdup0 LHS
-  2886516736U,	// <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
-  1812775014U,	// <0,0,1,1>: Cost 2 vzipl LHS, LHS
-  1618133094U,	// <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2625209292U,	// <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
-  2886558034U,	// <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
-  2617246864U,	// <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
-  3659723031U,	// <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
-  2591953904U,	// <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
-  1812775581U,	// <0,0,1,u>: Cost 2 vzipl LHS, LHS
-  3020734464U,	// <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
-  3020734474U,	// <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
-  1946992742U,	// <0,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2631181989U,	// <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
-  3020734668U,	// <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
-  3826550569U,	// <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
-  2617247674U,	// <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
-  2591962097U,	// <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1946992796U,	// <0,0,2,u>: Cost 2 vtrnl LHS, LHS
-  2635163787U,	// <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2686419196U,	// <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
-  2686492933U,	// <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
-  2617248156U,	// <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
-  2617248258U,	// <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
-  3826551298U,	// <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
-  3690990200U,	// <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
-  3713551042U,	// <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
-  2635163787U,	// <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2617248658U,	// <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
-  2888450150U,	// <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021570150U,	// <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  3641829519U,	// <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
-  3021570252U,	// <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
-  1543507254U,	// <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752810294U,	// <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  3786998152U,	// <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
-  1543507497U,	// <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
-  2684354972U,	// <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
-  2617249488U,	// <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
-  3765617070U,	// <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
-  3635865780U,	// <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
-  2617249734U,	// <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
-  2617249796U,	// <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
-  2718712274U,	// <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
-  2617249960U,	// <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
-  2720039396U,	// <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
-  2684355053U,	// <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
-  3963609190U,	// <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
-  2617250298U,	// <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
-  3796435464U,	// <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
-  3659762998U,	// <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
-  3659763810U,	// <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
-  2617250616U,	// <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
-  2657727309U,	// <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
-  2658390942U,	// <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
-  2659054575U,	// <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-  3635880854U,	// <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
-  3635881401U,	// <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
-  3734787298U,	// <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
-  2617251174U,	// <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
-  3659772002U,	// <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
-  3659772189U,	// <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
-  2617251436U,	// <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
-  2659054575U,	// <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-  135053414U,	// <0,0,u,0>: Cost 1 vdup0 LHS
-  1817419878U,	// <0,0,u,1>: Cost 2 vzipl LHS, LHS
-  1947435110U,	// <0,0,u,2>: Cost 2 vtrnl LHS, LHS
-  2568120467U,	// <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
-  1476463926U,	// <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
-  1543510170U,	// <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752813210U,	// <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  2592011255U,	// <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
-  135053414U,	// <0,0,u,u>: Cost 1 vdup0 LHS
-  2618581002U,	// <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
-  1557446758U,	// <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2618581155U,	// <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
-  2690548468U,	// <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
-  2626543954U,	// <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
-  4094985216U,	// <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
-  2592019278U,	// <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
-  2592019448U,	// <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
-  1557447325U,	// <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
-  1476476938U,	// <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
-  2886517556U,	// <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
-  2886517654U,	// <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
-  2886517720U,	// <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
-  1476480310U,	// <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
-  2886558864U,	// <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
-  2550223354U,	// <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
-  2550223856U,	// <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
-  1476482862U,	// <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
-  1494401126U,	// <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
-  3020735284U,	// <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
-  2562172349U,	// <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
-  835584U,	// <0,1,2,3>: Cost 0 copy LHS
-  1494404406U,	// <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
-  3020735488U,	// <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
-  2631190458U,	// <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
-  1518294010U,	// <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
-  835584U,	// <0,1,2,u>: Cost 0 copy LHS
-  2692318156U,	// <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
-  2691875800U,	// <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
-  2691875806U,	// <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
-  2692539367U,	// <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
-  2562182454U,	// <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
-  2691875840U,	// <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
-  2692760578U,	// <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
-  2639817411U,	// <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
-  2691875863U,	// <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
-  2568159334U,	// <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
-  4095312692U,	// <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
-  2568160934U,	// <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
-  2568161432U,	// <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
-  2568162614U,	// <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
-  1557450038U,	// <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754235702U,	// <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  2592052220U,	// <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
-  1557450281U,	// <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
-  3765617775U,	// <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
-  2647781007U,	// <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
-  3704934138U,	// <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
-  2691875984U,	// <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
-  2657734598U,	// <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
-  2650435539U,	// <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
-  2651099172U,	// <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
-  2651762805U,	// <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
-  2691876029U,	// <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
-  2592063590U,	// <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
-  3765617871U,	// <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
-  2654417337U,	// <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
-  3765617889U,	// <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
-  2592066870U,	// <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
-  3765617907U,	// <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
-  2657071869U,	// <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
-  1583993678U,	// <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
-  1584657311U,	// <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
-  2657735672U,	// <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
-  2657735808U,	// <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
-  2631193772U,	// <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
-  2661053667U,	// <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
-  2657736038U,	// <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
-  3721524621U,	// <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
-  2657736158U,	// <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
-  2657736300U,	// <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
-  2657736322U,	// <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
-  1494450278U,	// <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
-  1557452590U,	// <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2754238254U,	// <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
-  835584U,	// <0,1,u,3>: Cost 0 copy LHS
-  1494453558U,	// <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
-  1557452954U,	// <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754238618U,	// <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  1518343168U,	// <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
-  835584U,	// <0,1,u,u>: Cost 0 copy LHS
-  2752299008U,	// <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
-  1544847462U,	// <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678557286U,	// <0,2,0,2>: Cost 2 vuzpl LHS, LHS
-  2696521165U,	// <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
-  2752340172U,	// <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
-  2691876326U,	// <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
-  2618589695U,	// <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
-  2592093185U,	// <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
-  1678557340U,	// <0,2,0,u>: Cost 2 vuzpl LHS, LHS
-  2618589942U,	// <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
-  2752299828U,	// <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
-  2886518376U,	// <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
-  2752299766U,	// <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  2550295862U,	// <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
-  2752340992U,	// <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
-  2886559674U,	// <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
-  3934208106U,	// <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
-  2752340771U,	// <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
-  1476558868U,	// <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
-  2226628029U,	// <0,2,2,1>: Cost 3 vrev <2,0,1,2>
-  2752300648U,	// <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
-  3020736114U,	// <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476562230U,	// <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
-  2550304464U,	// <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
-  2618591162U,	// <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
-  2550305777U,	// <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
-  1476564782U,	// <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
-  2618591382U,	// <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
-  2752301206U,	// <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  3826043121U,	// <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
-  2752301468U,	// <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618591746U,	// <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
-  2752301570U,	// <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  3830688102U,	// <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
-  2698807012U,	// <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
-  2752301269U,	// <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562261094U,	// <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
-  4095313828U,	// <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
-  2226718152U,	// <0,2,4,2>: Cost 3 vrev <2,0,2,4>
-  2568235169U,	// <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
-  2562264374U,	// <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
-  1544850742U,	// <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678560566U,	// <0,2,4,6>: Cost 2 vuzpl LHS, RHS
-  2592125957U,	// <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1678560584U,	// <0,2,4,u>: Cost 2 vuzpl LHS, RHS
-  2691876686U,	// <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
-  2618592976U,	// <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
-  3765618528U,	// <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
-  3765618536U,	// <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
-  2618593222U,	// <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
-  2752303108U,	// <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
-  2618593378U,	// <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
-  2824785206U,	// <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2824785207U,	// <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2752303950U,	// <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
-  3830690081U,	// <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
-  2618593786U,	// <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
-  2691876794U,	// <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
-  2752303990U,	// <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
-  3830690445U,	// <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
-  2752303928U,	// <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
-  2657743695U,	// <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
-  2691876839U,	// <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
-  2659070961U,	// <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  2659734594U,	// <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
-  3734140051U,	// <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
-  2701166596U,	// <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
-  2662389094U,	// <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
-  2662389126U,	// <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
-  3736794583U,	// <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
-  2752304748U,	// <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
-  2659070961U,	// <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  1476608026U,	// <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
-  1544853294U,	// <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678563118U,	// <0,2,u,2>: Cost 2 vuzpl LHS, LHS
-  3021178482U,	// <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476611382U,	// <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
-  1544853658U,	// <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678563482U,	// <0,2,u,6>: Cost 2 vuzpl LHS, RHS
-  2824785449U,	// <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  1678563172U,	// <0,2,u,u>: Cost 2 vuzpl LHS, LHS
-  2556329984U,	// <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
-  2686421142U,	// <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
-  2562303437U,	// <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
-  4094986652U,	// <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
-  2556333366U,	// <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
-  4094986754U,	// <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
-  3798796488U,	// <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
-  3776530634U,	// <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
-  2556335918U,	// <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
-  2886518934U,	// <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
-  2556338933U,	// <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
-  2691877105U,	// <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
-  2886519196U,	// <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
-  2886519298U,	// <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
-  4095740418U,	// <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
-  3659944242U,	// <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
-  3769600286U,	// <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
-  2886519582U,	// <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
-  1482604646U,	// <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
-  1482605302U,	// <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
-  2556348008U,	// <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
-  3020736924U,	// <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482607926U,	// <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
-  3020737026U,	// <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598154746U,	// <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
-  2598155258U,	// <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
-  1482610478U,	// <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
-  3692341398U,	// <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
-  2635851999U,	// <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
-  3636069840U,	// <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
-  2691877276U,	// <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
-  3961522690U,	// <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
-  3826797058U,	// <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
-  3703622282U,	// <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
-  3769600452U,	// <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
-  2640497430U,	// <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
-  3962194070U,	// <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
-  2232617112U,	// <0,3,4,1>: Cost 3 vrev <3,0,1,4>
-  2232690849U,	// <0,3,4,2>: Cost 3 vrev <3,0,2,4>
-  4095314332U,	// <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
-  3962194434U,	// <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
-  2691877378U,	// <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
-  3826765110U,	// <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
-  3665941518U,	// <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
-  2691877405U,	// <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
-  3630112870U,	// <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
-  3630113526U,	// <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
-  4035199734U,	// <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
-  3769600578U,	// <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
-  2232846516U,	// <0,3,5,4>: Cost 3 vrev <3,0,4,5>
-  3779037780U,	// <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
-  2718714461U,	// <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
-  2706106975U,	// <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
-  2233141464U,	// <0,3,5,u>: Cost 3 vrev <3,0,u,5>
-  2691877496U,	// <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
-  3727511914U,	// <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
-  3765619338U,	// <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
-  3765619347U,	// <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
-  3765987996U,	// <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
-  3306670270U,	// <0,3,6,5>: Cost 4 vrev <3,0,5,6>
-  3792456365U,	// <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
-  2706770608U,	// <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
-  2706844345U,	// <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
-  3769600707U,	// <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
-  2659742787U,	// <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
-  3636102612U,	// <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
-  3769600740U,	// <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
-  3769600747U,	// <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
-  3769600758U,	// <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
-  3659993400U,	// <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
-  3781176065U,	// <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
-  2664388218U,	// <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
-  1482653798U,	// <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
-  1482654460U,	// <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
-  2556397160U,	// <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
-  3021179292U,	// <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482657078U,	// <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
-  3021179394U,	// <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598203898U,	// <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
-  2708097874U,	// <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
-  1482659630U,	// <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
-  2617278468U,	// <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
-  2618605670U,	// <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2618605734U,	// <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
-  3642091695U,	// <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
-  2753134796U,	// <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
-  2718714770U,	// <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
-  3021245750U,	// <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  3665982483U,	// <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
-  3021245768U,	// <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2568355942U,	// <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
-  3692348212U,	// <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
-  3692348310U,	// <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
-  2568358064U,	// <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
-  2568359222U,	// <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
-  1812778294U,	// <0,4,1,5>: Cost 2 vzipl LHS, RHS
-  3022671158U,	// <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
-  2592248852U,	// <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1812778537U,	// <0,4,1,u>: Cost 2 vzipl LHS, RHS
-  2568364134U,	// <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
-  2238573423U,	// <0,4,2,1>: Cost 3 vrev <4,0,1,2>
-  3692349032U,	// <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
-  2631214761U,	// <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
-  2568367414U,	// <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
-  2887028022U,	// <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
-  1946996022U,	// <0,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,	// <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1946996040U,	// <0,4,2,u>: Cost 2 vtrnl LHS, RHS
-  3692349590U,	// <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
-  3826878614U,	// <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
-  3826878625U,	// <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
-  3692349852U,	// <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
-  3692349954U,	// <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
-  3826878978U,	// <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
-  4095200566U,	// <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
-  3713583814U,	// <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
-  3692350238U,	// <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
-  2550464552U,	// <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
-  3962194914U,	// <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
-  3693677631U,	// <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
-  3642124467U,	// <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
-  2718715088U,	// <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
-  2618608950U,	// <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
-  2753137974U,	// <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
-  3666015255U,	// <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
-  2618609193U,	// <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
-  2568388710U,	// <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
-  2568389526U,	// <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
-  3636159963U,	// <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
-  2568390836U,	// <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
-  2568391990U,	// <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
-  2718715180U,	// <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
-  1618136374U,	// <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592281624U,	// <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
-  1618136392U,	// <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2550480938U,	// <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
-  3826880801U,	// <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
-  2562426332U,	// <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
-  3786190181U,	// <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
-  2718715252U,	// <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
-  3826881165U,	// <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
-  2712669568U,	// <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
-  2657760081U,	// <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
-  2718715284U,	// <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
-  3654090854U,	// <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
-  3934229326U,	// <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
-  3734156437U,	// <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
-  3734820070U,	// <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
-  3654094134U,	// <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
-  2713259464U,	// <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2713333201U,	// <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
-  3654095866U,	// <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
-  2713259464U,	// <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2568413286U,	// <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
-  2618611502U,	// <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2753140526U,	// <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
-  2568415415U,	// <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
-  2568416566U,	// <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
-  1817423158U,	// <0,4,u,5>: Cost 2 vzipl LHS, RHS
-  1947438390U,	// <0,4,u,6>: Cost 2 vtrnl LHS, RHS
-  2592306203U,	// <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
-  1947438408U,	// <0,4,u,u>: Cost 2 vtrnl LHS, RHS
-  3630219264U,	// <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
-  2625912934U,	// <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  3692355748U,	// <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
-  3693019384U,	// <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
-  3630222646U,	// <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
-  3699655062U,	// <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
-  2718715508U,	// <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
-  3087011126U,	// <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
-  2625913501U,	// <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
-  1500659814U,	// <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
-  2886520528U,	// <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
-  2574403176U,	// <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
-  2574403734U,	// <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
-  1500662674U,	// <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
-  2886520836U,	// <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
-  2886520930U,	// <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
-  2718715600U,	// <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
-  1500665646U,	// <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
-  2556493926U,	// <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
-  2244546120U,	// <0,5,2,1>: Cost 3 vrev <5,0,1,2>
-  3692357256U,	// <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
-  2568439994U,	// <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
-  2556497206U,	// <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
-  3020738564U,	// <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
-  4027877161U,	// <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
-  3093220662U,	// <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3093220663U,	// <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3699656854U,	// <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
-  3699656927U,	// <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
-  3699657006U,	// <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
-  3699657116U,	// <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
-  2637859284U,	// <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
-  3790319453U,	// <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
-  3699657354U,	// <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
-  2716725103U,	// <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
-  2716798840U,	// <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
-  2661747602U,	// <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
-  3630252810U,	// <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
-  3636225507U,	// <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
-  3716910172U,	// <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
-  3962195892U,	// <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
-  2625916214U,	// <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  3718901071U,	// <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
-  2718715846U,	// <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
-  2625916457U,	// <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
-  3791278034U,	// <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
-  3791351771U,	// <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
-  3318386260U,	// <0,5,5,2>: Cost 4 vrev <5,0,2,5>
-  3791499245U,	// <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
-  3318533734U,	// <0,5,5,4>: Cost 4 vrev <5,0,4,5>
-  2718715908U,	// <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
-  2657767522U,	// <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
-  2718715928U,	// <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
-  2718715937U,	// <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
-  2592358502U,	// <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
-  3792015404U,	// <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
-  3731509754U,	// <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
-  3785748546U,	// <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
-  2592361782U,	// <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
-  2592362594U,	// <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
-  3785748576U,	// <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
-  1644974178U,	// <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
-  1645047915U,	// <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
-  2562506854U,	// <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
-  2562507670U,	// <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
-  2562508262U,	// <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
-  3636250774U,	// <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
-  2562510134U,	// <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
-  2718716072U,	// <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
-  2718716074U,	// <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
-  2719379635U,	// <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
-  2562512686U,	// <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
-  1500717158U,	// <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
-  2625918766U,	// <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  2719674583U,	// <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
-  2568489152U,	// <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
-  1500720025U,	// <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
-  2625919130U,	// <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  2586407243U,	// <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
-  1646301444U,	// <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
-  1646375181U,	// <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
-  2586411110U,	// <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
-  2619949158U,	// <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2619949220U,	// <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
-  3785748789U,	// <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
-  2619949386U,	// <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
-  2586415202U,	// <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
-  2586415436U,	// <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
-  2952793398U,	// <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
-  2619949725U,	// <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562531430U,	// <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
-  3693691700U,	// <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
-  2886521338U,	// <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
-  3693691864U,	// <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
-  2562534710U,	// <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
-  2580450932U,	// <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
-  2886521656U,	// <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
-  2966736182U,	// <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  2966736183U,	// <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
-  1500741734U,	// <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
-  2250518817U,	// <0,6,2,1>: Cost 3 vrev <6,0,1,2>
-  2574485096U,	// <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
-  2631894694U,	// <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
-  1500744604U,	// <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
-  2574487248U,	// <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
-  3020739384U,	// <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
-  2954136886U,	// <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
-  1500747566U,	// <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
-  3693693078U,	// <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
-  3705637136U,	// <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
-  3705637192U,	// <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
-  3693693340U,	// <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
-  2637867477U,	// <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
-  3705637424U,	// <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
-  3666154056U,	// <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
-  2722697800U,	// <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
-  2722771537U,	// <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
-  2562556006U,	// <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
-  4095316257U,	// <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
-  2562557420U,	// <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
-  3636299926U,	// <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
-  2562559286U,	// <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
-  2619952438U,	// <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2723287696U,	// <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
-  4027895094U,	// <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
-  2619952681U,	// <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
-  2718716594U,	// <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3648250774U,	// <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
-  3792458436U,	// <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
-  3705638767U,	// <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
-  3648252831U,	// <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
-  3797619416U,	// <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
-  3792458472U,	// <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
-  4035202358U,	// <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
-  2718716594U,	// <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3786412796U,	// <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
-  3792458504U,	// <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
-  3728200126U,	// <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
-  3798135575U,	// <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
-  3786412836U,	// <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
-  3792458543U,	// <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
-  2718716728U,	// <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
-  2718716738U,	// <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
-  2718716747U,	// <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
-  2718716750U,	// <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
-  2724909910U,	// <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
-  3636323823U,	// <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
-  2725057384U,	// <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
-  2718716790U,	// <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
-  2718716800U,	// <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
-  3792458629U,	// <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
-  2725352332U,	// <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
-  2718716822U,	// <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
-  1500790886U,	// <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
-  2619954990U,	// <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562590192U,	// <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
-  2725721017U,	// <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
-  1500793762U,	// <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
-  2619955354U,	// <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2725942228U,	// <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
-  2954186038U,	// <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
-  1500796718U,	// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
-  2256401391U,	// <0,7,0,0>: Cost 3 vrev <7,0,0,0>
-  2632564838U,	// <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256548865U,	// <0,7,0,2>: Cost 3 vrev <7,0,2,0>
-  3700998396U,	// <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
-  2718716952U,	// <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
-  2718716962U,	// <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
-  2621284845U,	// <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
-  3904685542U,	// <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
-  2632565405U,	// <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256409584U,	// <0,7,1,0>: Cost 3 vrev <7,0,0,1>
-  3706307380U,	// <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
-  2632565654U,	// <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
-  3769603168U,	// <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
-  2256704532U,	// <0,7,1,4>: Cost 3 vrev <7,0,4,1>
-  3769603184U,	// <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
-  3700999366U,	// <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
-  2886522476U,	// <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
-  2256999480U,	// <0,7,1,u>: Cost 3 vrev <7,0,u,1>
-  2586501222U,	// <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
-  1182749690U,	// <0,7,2,1>: Cost 2 vrev <7,0,1,2>
-  3636356595U,	// <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
-  2727711916U,	// <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
-  2586504502U,	// <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
-  2632566606U,	// <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
-  2586505559U,	// <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
-  3020740204U,	// <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
-  1183265849U,	// <0,7,2,u>: Cost 2 vrev <7,0,u,2>
-  3701000342U,	// <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
-  3706308849U,	// <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
-  3330315268U,	// <0,7,3,2>: Cost 4 vrev <7,0,2,3>
-  3706309020U,	// <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
-  3706309122U,	// <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
-  3712281127U,	// <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
-  2639202936U,	// <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  3802412321U,	// <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
-  2640530202U,	// <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
-  3654287462U,	// <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
-  2256507900U,	// <0,7,4,1>: Cost 3 vrev <7,0,1,4>
-  2256581637U,	// <0,7,4,2>: Cost 3 vrev <7,0,2,4>
-  3660262008U,	// <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
-  3786413405U,	// <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
-  2632568118U,	// <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  3718917457U,	// <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
-  3787003255U,	// <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
-  2632568361U,	// <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
-  3706310268U,	// <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
-  3792459156U,	// <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
-  3330331654U,	// <0,7,5,2>: Cost 4 vrev <7,0,2,5>
-  3722899255U,	// <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
-  2256737304U,	// <0,7,5,4>: Cost 3 vrev <7,0,4,5>
-  3724226521U,	// <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
-  2718717377U,	// <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
-  2729997763U,	// <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
-  2720044499U,	// <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
-  3712946517U,	// <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
-  2256524286U,	// <0,7,6,1>: Cost 3 vrev <7,0,1,6>
-  3792459246U,	// <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
-  3796440567U,	// <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
-  3654307126U,	// <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
-  2656457394U,	// <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
-  3792459281U,	// <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
-  2730661396U,	// <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
-  2658448293U,	// <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
-  3787003431U,	// <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
-  3654312854U,	// <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
-  3654313446U,	// <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
-  3804771905U,	// <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
-  3654315318U,	// <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
-  3654315651U,	// <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
-  3660288348U,	// <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
-  2718717548U,	// <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
-  2664420990U,	// <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
-  2256466935U,	// <0,7,u,0>: Cost 3 vrev <7,0,0,u>
-  1182798848U,	// <0,7,u,1>: Cost 2 vrev <7,0,1,u>
-  2256614409U,	// <0,7,u,2>: Cost 3 vrev <7,0,2,u>
-  2731693714U,	// <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
-  2256761883U,	// <0,7,u,4>: Cost 3 vrev <7,0,4,u>
-  2632571034U,	// <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  2669066421U,	// <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
-  2731988662U,	// <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
-  1183315007U,	// <0,7,u,u>: Cost 2 vrev <7,0,u,u>
-  135053414U,	// <0,u,0,0>: Cost 1 vdup0 LHS
-  1544896614U,	// <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1678999654U,	// <0,u,0,2>: Cost 2 vuzpl LHS, LHS
-  2691880677U,	// <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
-  1476988214U,	// <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
-  2718791419U,	// <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
-  3021248666U,	// <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2592535607U,	// <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
-  135053414U,	// <0,u,0,u>: Cost 1 vdup0 LHS
-  1476993097U,	// <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
-  1812780846U,	// <0,u,1,1>: Cost 2 vzipl LHS, LHS
-  1618138926U,	// <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2752742134U,	// <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  1476996406U,	// <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
-  1812781210U,	// <0,u,1,5>: Cost 2 vzipl LHS, RHS
-  2887006416U,	// <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
-  2966736200U,	// <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  1812781413U,	// <0,u,1,u>: Cost 2 vzipl LHS, LHS
-  1482973286U,	// <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
-  1482973987U,	// <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
-  1946998574U,	// <0,u,2,2>: Cost 2 vtrnl LHS, LHS
-  835584U,	// <0,u,2,3>: Cost 0 copy LHS
-  1482976566U,	// <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
-  3020781631U,	// <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
-  1946998938U,	// <0,u,2,6>: Cost 2 vtrnl LHS, RHS
-  1518810169U,	// <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
-  835584U,	// <0,u,2,u>: Cost 0 copy LHS
-  2618640534U,	// <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
-  2752743574U,	// <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  2636556597U,	// <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
-  2752743836U,	// <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618640898U,	// <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
-  2752743938U,	// <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  2639202936U,	// <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  2639874762U,	// <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
-  2752743637U,	// <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562703462U,	// <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
-  2888455982U,	// <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021575982U,	// <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  2568677591U,	// <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
-  2562706742U,	// <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
-  1544899894U,	// <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679002934U,	// <0,u,4,6>: Cost 2 vuzpl LHS, RHS
-  2718718033U,	// <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
-  1679002952U,	// <0,u,4,u>: Cost 2 vuzpl LHS, RHS
-  2568683622U,	// <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
-  2568684438U,	// <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
-  3765622902U,	// <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
-  2691881087U,	// <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
-  2568686902U,	// <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
-  2650492890U,	// <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
-  1618139290U,	// <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2824834358U,	// <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
-  1618139308U,	// <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592579686U,	// <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
-  2262496983U,	// <0,u,6,1>: Cost 3 vrev <u,0,1,6>
-  2654474688U,	// <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
-  2691881168U,	// <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
-  2592582966U,	// <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
-  2656465587U,	// <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
-  2657129220U,	// <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
-  1584051029U,	// <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
-  1584714662U,	// <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
-  2562728038U,	// <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
-  2562728854U,	// <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
-  2562729473U,	// <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
-  2661111018U,	// <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
-  2562731318U,	// <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
-  2718718258U,	// <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
-  2586620261U,	// <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
-  2657793644U,	// <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
-  2562733870U,	// <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
-  135053414U,	// <0,u,u,0>: Cost 1 vdup0 LHS
-  1544902446U,	// <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1679005486U,	// <0,u,u,2>: Cost 2 vuzpl LHS, LHS
-  835584U,	// <0,u,u,3>: Cost 0 copy LHS
-  1483025718U,	// <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
-  1544902810U,	// <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679005850U,	// <0,u,u,6>: Cost 2 vuzpl LHS, RHS
-  1518859327U,	// <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
-  835584U,	// <0,u,u,u>: Cost 0 copy LHS
-  2689744896U,	// <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
-  1610694666U,	// <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
-  2689744916U,	// <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
-  2619310332U,	// <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
-  2684657701U,	// <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
-  2620637598U,	// <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
-  3708977654U,	// <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
-  3666351168U,	// <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
-  1611210825U,	// <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
-  2556780646U,	// <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
-  2556781355U,	// <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
-  1616003174U,	// <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  3693052888U,	// <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
-  2556783926U,	// <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
-  2580672143U,	// <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
-  2724839566U,	// <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
-  3654415354U,	// <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
-  1616003228U,	// <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
-  2685690019U,	// <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
-  2685763756U,	// <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
-  2698297524U,	// <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
-  2685911230U,	// <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
-  2689745100U,	// <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
-  3764814038U,	// <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
-  2724839640U,	// <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
-  2592625658U,	// <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
-  2686279915U,	// <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
-  3087843328U,	// <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  3087843338U,	// <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
-  67944550U,	// <1,0,3,2>: Cost 1 vrev LHS
-  2568743135U,	// <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
-  2562772278U,	// <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
-  4099850454U,	// <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
-  3704998538U,	// <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
-  2592633923U,	// <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
-  68386972U,	// <1,0,3,u>: Cost 1 vrev LHS
-  2620640146U,	// <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
-  2689745234U,	// <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
-  2689745244U,	// <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
-  3760980320U,	// <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
-  3761054057U,	// <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
-  2619313462U,	// <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  3761201531U,	// <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
-  3666383940U,	// <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
-  2619313705U,	// <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
-  4029300736U,	// <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
-  2895249510U,	// <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
-  3028287590U,	// <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3642501345U,	// <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
-  2215592058U,	// <1,0,5,4>: Cost 3 vrev <0,1,4,5>
-  3724242907U,	// <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
-  3724906540U,	// <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
-  3911118134U,	// <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
-  3028287644U,	// <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3762086375U,	// <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
-  2698297846U,	// <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
-  3760022015U,	// <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
-  3642509538U,	// <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
-  3762381323U,	// <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
-  3730215604U,	// <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
-  3730879237U,	// <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
-  2657801046U,	// <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
-  2658464679U,	// <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
-  2659128312U,	// <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
-  4047898278U,	// <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
-  2215460970U,	// <1,0,7,2>: Cost 3 vrev <0,1,2,7>
-  3734861035U,	// <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
-  3731543398U,	// <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
-  3736188301U,	// <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
-  2663110110U,	// <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
-  3731543660U,	// <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
-  2664437376U,	// <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
-  3087884288U,	// <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  1616003730U,	// <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
-  67985515U,	// <1,0,u,2>: Cost 1 vrev LHS
-  2689893028U,	// <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
-  2689745586U,	// <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
-  2619316378U,	// <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  2669082807U,	// <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
-  2592674888U,	// <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
-  68427937U,	// <1,0,u,u>: Cost 1 vrev LHS
-  1543585802U,	// <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
-  1548894310U,	// <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
-  2618654892U,	// <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
-  2689745654U,	// <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
-  2622636370U,	// <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
-  2620645791U,	// <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
-  3696378367U,	// <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
-  3666424905U,	// <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
-  1548894866U,	// <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
-  1483112550U,	// <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-  202162278U,	// <1,1,1,1>: Cost 1 vdup1 LHS
-  2622636950U,	// <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
-  2622637016U,	// <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
-  1483115830U,	// <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2622637200U,	// <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
-  2622637263U,	// <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
-  2592691274U,	// <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-  202162278U,	// <1,1,1,u>: Cost 1 vdup1 LHS
-  2550890588U,	// <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
-  2617329183U,	// <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
-  2622637672U,	// <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
-  2622637734U,	// <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
-  2550893878U,	// <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
-  3696379744U,	// <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
-  2622638010U,	// <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
-  3804554170U,	// <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
-  2622638139U,	// <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
-  2622638230U,	// <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
-  3087844148U,	// <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
-  4161585244U,	// <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
-  2014101606U,	// <1,1,3,3>: Cost 2 vtrnr LHS, LHS
-  2622638594U,	// <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
-  2689745920U,	// <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
-  3763487753U,	// <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
-  2592707660U,	// <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
-  2014101611U,	// <1,1,3,u>: Cost 2 vtrnr LHS, LHS
-  2556878950U,	// <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
-  2221335351U,	// <1,1,4,1>: Cost 3 vrev <1,1,1,4>
-  3696380988U,	// <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
-  3763487805U,	// <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
-  2556882230U,	// <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
-  1548897590U,	// <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2758184246U,	// <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
-  3666457677U,	// <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
-  1548897833U,	// <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
-  2693653615U,	// <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
-  2617331408U,	// <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
-  4029302934U,	// <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
-  2689746064U,	// <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
-  2221564755U,	// <1,1,5,4>: Cost 3 vrev <1,1,4,5>
-  2955559250U,	// <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
-  2617331810U,	// <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
-  2825293110U,	// <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-  2689746109U,	// <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
-  3696382241U,	// <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
-  2689746127U,	// <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
-  2617332218U,	// <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
-  3763487969U,	// <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
-  3696382605U,	// <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
-  4029309266U,	// <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
-  2617332536U,	// <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
-  2724840702U,	// <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
-  2725504263U,	// <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
-  2617332720U,	// <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
-  2659800138U,	// <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  3691074717U,	// <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
-  4167811174U,	// <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
-  2617333094U,	// <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
-  3295396702U,	// <1,1,7,5>: Cost 4 vrev <1,1,5,7>
-  3803891014U,	// <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
-  2617333356U,	// <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
-  2659800138U,	// <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  1483112550U,	// <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
-  202162278U,	// <1,1,u,1>: Cost 1 vdup1 LHS
-  2622642056U,	// <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
-  2014142566U,	// <1,1,u,3>: Cost 2 vtrnr LHS, LHS
-  1483115830U,	// <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  1548900506U,	// <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2622642384U,	// <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
-  2825293353U,	// <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-  202162278U,	// <1,1,u,u>: Cost 1 vdup1 LHS
-  2635251712U,	// <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
-  1561509990U,	// <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
-  2618663085U,	// <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
-  2696529358U,	// <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
-  2635252050U,	// <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
-  3769533926U,	// <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
-  2621317617U,	// <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
-  2659140170U,	// <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
-  1561510557U,	// <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
-  2623308516U,	// <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
-  2635252532U,	// <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
-  2631271318U,	// <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
-  2958180454U,	// <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
-  2550959414U,	// <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
-  2635252880U,	// <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
-  2635252952U,	// <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
-  3732882731U,	// <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
-  2958180459U,	// <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
-  2629281213U,	// <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
-  2635253280U,	// <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
-  2618664552U,	// <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
-  2689746546U,	// <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
-  3764815485U,	// <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
-  3760023176U,	// <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
-  2635253690U,	// <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
-  2659141610U,	// <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
-  2689746591U,	// <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
-  403488870U,	// <1,2,3,0>: Cost 1 vext1 LHS, LHS
-  1477231350U,	// <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477232232U,	// <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477233052U,	// <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
-  403492150U,	// <1,2,3,4>: Cost 1 vext1 LHS, RHS
-  1525010128U,	// <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1525010938U,	// <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525011450U,	// <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-  403494702U,	// <1,2,3,u>: Cost 1 vext1 LHS, LHS
-  2641226607U,	// <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
-  3624723446U,	// <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
-  3301123609U,	// <1,2,4,2>: Cost 4 vrev <2,1,2,4>
-  2598759198U,	// <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
-  2659142864U,	// <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
-  1561513270U,	// <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  2659143028U,	// <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
-  2659143112U,	// <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
-  1561513513U,	// <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
-  2550988902U,	// <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
-  2550989824U,	// <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
-  3624732264U,	// <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
-  2955559014U,	// <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2550992182U,	// <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
-  2659143684U,	// <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
-  2659143778U,	// <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
-  2659143848U,	// <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
-  2550994734U,	// <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
-  2700289945U,	// <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
-  2635256232U,	// <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
-  2659144186U,	// <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
-  2689746874U,	// <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
-  3763488705U,	// <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
-  3763488716U,	// <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
-  2659144504U,	// <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
-  2657817432U,	// <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
-  2689746919U,	// <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
-  1585402874U,	// <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
-  2659144770U,	// <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
-  3708998858U,	// <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
-  2635257059U,	// <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
-  2659145062U,	// <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
-  3732886916U,	// <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
-  3732886998U,	// <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
-  2659145255U,	// <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
-  1590711938U,	// <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
-  403529835U,	// <1,2,u,0>: Cost 1 vext1 LHS, LHS
-  1477272310U,	// <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477273192U,	// <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477273750U,	// <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
-  403533110U,	// <1,2,u,4>: Cost 1 vext1 LHS, RHS
-  1561516186U,	// <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  1525051898U,	// <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525052410U,	// <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-  403535662U,	// <1,2,u,u>: Cost 1 vext1 LHS, LHS
-  2819407872U,	// <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
-  1551564902U,	// <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
-  2819408630U,	// <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
-  2619334911U,	// <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
-  2625306962U,	// <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
-  3832725879U,	// <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
-  3699048959U,	// <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
-  3776538827U,	// <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
-  1551565469U,	// <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
-  2618671862U,	// <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
-  2819408692U,	// <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
-  2624643975U,	// <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
-  1745666150U,	// <1,3,1,3>: Cost 2 vuzpr LHS, LHS
-  2557005110U,	// <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
-  2625307792U,	// <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
-  3698386127U,	// <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
-  2592838748U,	// <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
-  1745666155U,	// <1,3,1,u>: Cost 2 vuzpr LHS, LHS
-  2819408790U,	// <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2625308193U,	// <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
-  2819408036U,	// <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819851890U,	// <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819408794U,	// <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  3893149890U,	// <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
-  2819408076U,	// <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  3772041583U,	// <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
-  2819408042U,	// <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-  1483276390U,	// <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
-  1483277128U,	// <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
-  2557019752U,	// <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
-  2819408856U,	// <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
-  1483279670U,	// <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
-  2819409614U,	// <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2598826490U,	// <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
-  3087844352U,	// <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
-  1483282222U,	// <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
-  2568970342U,	// <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
-  2568971224U,	// <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
-  3832761290U,	// <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
-  2233428219U,	// <1,3,4,3>: Cost 3 vrev <3,1,3,4>
-  2568973622U,	// <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
-  1551568182U,	// <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410434U,	// <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
-  3666605151U,	// <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
-  1551568425U,	// <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
-  2563006566U,	// <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
-  2568979456U,	// <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
-  2563008035U,	// <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
-  2233436412U,	// <1,3,5,3>: Cost 3 vrev <3,1,3,5>
-  2563009846U,	// <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
-  2867187716U,	// <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
-  2655834214U,	// <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
-  1745669430U,	// <1,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1745669431U,	// <1,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2867187810U,	// <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
-  3699052931U,	// <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
-  2654507460U,	// <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
-  3766291091U,	// <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
-  2655834726U,	// <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
-  3923384562U,	// <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
-  2657161992U,	// <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
-  2819852218U,	// <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819852219U,	// <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  2706926275U,	// <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
-  2659816524U,	// <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
-  3636766245U,	// <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
-  2867187903U,	// <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
-  2625312102U,	// <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
-  2867188598U,	// <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
-  3728250344U,	// <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
-  2867187880U,	// <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
-  2707516171U,	// <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
-  1483317350U,	// <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
-  1483318093U,	// <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
-  2819410718U,	// <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
-  1745666717U,	// <1,3,u,3>: Cost 2 vuzpr LHS, LHS
-  1483320630U,	// <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
-  1551571098U,	// <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410758U,	// <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
-  1745669673U,	// <1,3,u,7>: Cost 2 vuzpr LHS, RHS
-  1745666722U,	// <1,3,u,u>: Cost 2 vuzpr LHS, LHS
-  2617352205U,	// <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
-  2619342950U,	// <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  3692421295U,	// <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
-  2619343104U,	// <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2617352530U,	// <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
-  1634880402U,	// <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
-  2713930652U,	// <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
-  3732898396U,	// <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
-  1635101613U,	// <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
-  3693085430U,	// <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
-  2623988535U,	// <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
-  3693085590U,	// <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
-  3692422134U,	// <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
-  3693085726U,	// <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
-  2892401974U,	// <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
-  3026619702U,	// <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  3800206324U,	// <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
-  2892402217U,	// <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
-  3966978927U,	// <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
-  3966979018U,	// <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
-  3693086312U,	// <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
-  2635269798U,	// <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
-  3966979280U,	// <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
-  2893204790U,	// <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  3693086650U,	// <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
-  3666662502U,	// <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
-  2893205033U,	// <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
-  2563063910U,	// <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
-  2563064730U,	// <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
-  2563065386U,	// <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
-  3693087132U,	// <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
-  2619345410U,	// <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
-  3087843666U,	// <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
-  3087843676U,	// <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
-  3666670695U,	// <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
-  3087843669U,	// <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
-  2620672914U,	// <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
-  3630842706U,	// <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
-  3313069003U,	// <1,4,4,2>: Cost 4 vrev <4,1,2,4>
-  3642788100U,	// <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
-  2713930960U,	// <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
-  2619346230U,	// <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
-  2713930980U,	// <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
-  3736882642U,	// <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
-  2619346473U,	// <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
-  2557108326U,	// <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
-  2557109075U,	// <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
-  2598913774U,	// <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
-  3630852246U,	// <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
-  2557111606U,	// <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
-  2895252790U,	// <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616006454U,	// <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  3899059510U,	// <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
-  1616006472U,	// <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2557116518U,	// <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
-  2557117236U,	// <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
-  3630859880U,	// <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
-  2569062550U,	// <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
-  2557119798U,	// <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
-  3763490174U,	// <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
-  3763490183U,	// <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
-  2712751498U,	// <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  2557122350U,	// <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
-  2659161084U,	// <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
-  3732903040U,	// <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
-  3734230174U,	// <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
-  3734893807U,	// <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
-  3660729654U,	// <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
-  3786493384U,	// <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
-  2713341394U,	// <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
-  3660731386U,	// <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
-  2664470148U,	// <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
-  2557132902U,	// <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
-  2619348782U,	// <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  2563106351U,	// <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
-  2713783816U,	// <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
-  2622666815U,	// <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
-  1640189466U,	// <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
-  1616006697U,	// <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  2712751498U,	// <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  1616006715U,	// <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2620014592U,	// <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
-  1546272870U,	// <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2618687664U,	// <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
-  3693093120U,	// <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
-  1546273106U,	// <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2620678563U,	// <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
-  2714668660U,	// <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
-  3772042877U,	// <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
-  1546273437U,	// <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620015350U,	// <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
-  2620015412U,	// <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
-  2620015510U,	// <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
-  2618688512U,	// <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
-  2620015677U,	// <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
-  2620015727U,	// <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
-  2620015859U,	// <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
-  3093728566U,	// <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
-  2620015981U,	// <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
-  3692430816U,	// <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
-  2620016163U,	// <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
-  2620016232U,	// <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
-  2620016294U,	// <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
-  3693758221U,	// <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
-  3692431209U,	// <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
-  2620016570U,	// <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
-  4173598006U,	// <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
-  2620016699U,	// <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
-  2620016790U,	// <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
-  2569110672U,	// <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
-  3693758785U,	// <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
-  2620017052U,	// <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
-  2620017154U,	// <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
-  3135623172U,	// <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
-  4161587048U,	// <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
-  2014104886U,	// <1,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2014104887U,	// <1,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2620017554U,	// <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
-  2620017634U,	// <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  3693759551U,	// <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
-  3642861837U,	// <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
-  2575092710U,	// <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
-  1546276150U,	// <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2759855414U,	// <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
-  2713931718U,	// <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
-  1546276393U,	// <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
-  2557182054U,	// <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
-  2557182812U,	// <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
-  3630925347U,	// <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
-  4029301675U,	// <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
-  2557185334U,	// <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
-  2713931780U,	// <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
-  2667794530U,	// <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
-  2713931800U,	// <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
-  2557187886U,	// <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
-  2718208036U,	// <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
-  2620019115U,	// <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
-  2667794938U,	// <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
-  3787673666U,	// <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
-  3693761165U,	// <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
-  3319279297U,	// <1,5,6,5>: Cost 4 vrev <5,1,5,6>
-  2667795256U,	// <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
-  2713931874U,	// <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
-  2713931883U,	// <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
-  2557198438U,	// <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
-  2557199156U,	// <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
-  2569143974U,	// <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
-  2569144592U,	// <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
-  2557201718U,	// <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
-  2713931944U,	// <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
-  3787673770U,	// <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
-  2719387828U,	// <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
-  2557204270U,	// <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
-  2620020435U,	// <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
-  1546278702U,	// <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620020616U,	// <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
-  2620020668U,	// <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
-  1594054682U,	// <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
-  1546279066U,	// <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2620020944U,	// <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
-  2014145846U,	// <1,5,u,7>: Cost 2 vtrnr LHS, RHS
-  2014145847U,	// <1,5,u,u>: Cost 2 vtrnr LHS, RHS
-  3692437504U,	// <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
-  2618695782U,	// <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  2618695857U,	// <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
-  3794161970U,	// <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
-  2620023122U,	// <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
-  2620686756U,	// <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
-  2621350389U,	// <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
-  4028599606U,	// <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
-  2618696349U,	// <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
-  3692438262U,	// <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
-  2625995572U,	// <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
-  3692438422U,	// <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
-  3692438488U,	// <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
-  2625995820U,	// <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
-  3692438672U,	// <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
-  3692438720U,	// <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
-  2958183734U,	// <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-  2958183735U,	// <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
-  2721526201U,	// <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
-  3692439097U,	// <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
-  3692439144U,	// <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
-  3692439206U,	// <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
-  3636948278U,	// <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
-  3787674092U,	// <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
-  2618697658U,	// <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
-  2970799414U,	// <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2970799415U,	// <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
-  2563211366U,	// <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
-  3699738854U,	// <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
-  2563212860U,	// <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
-  3692439964U,	// <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
-  2563214646U,	// <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
-  4191820018U,	// <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
-  2587103648U,	// <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
-  3087845306U,	// <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  3087845307U,	// <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
-  3693767570U,	// <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
-  3693767650U,	// <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
-  3636962877U,	// <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
-  3325088134U,	// <1,6,4,3>: Cost 4 vrev <6,1,3,4>
-  3693767898U,	// <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
-  2618699062U,	// <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  3833670966U,	// <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
-  4028632374U,	// <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
-  2618699305U,	// <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
-  3693768264U,	// <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
-  3630998373U,	// <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
-  3636971070U,	// <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
-  3642943767U,	// <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
-  3693768628U,	// <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
-  3732918276U,	// <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
-  2620690530U,	// <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
-  2955562294U,	// <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
-  2955562295U,	// <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
-  2724180733U,	// <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
-  3631006566U,	// <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
-  3631007674U,	// <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
-  3692442184U,	// <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
-  3631009078U,	// <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
-  3787674416U,	// <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
-  2713932600U,	// <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
-  2713932610U,	// <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
-  2713932619U,	// <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
-  1651102542U,	// <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
-  2724918103U,	// <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
-  2698302306U,	// <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
-  3642960153U,	// <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
-  2713932662U,	// <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
-  2725213051U,	// <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
-  2724844426U,	// <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
-  4035956022U,	// <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
-  1651692438U,	// <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
-  1651766175U,	// <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
-  2618701614U,	// <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  3135663508U,	// <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
-  3692443580U,	// <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
-  2713932743U,	// <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
-  2618701978U,	// <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  2622683344U,	// <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
-  3087886266U,	// <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  1652356071U,	// <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
-  2726171632U,	// <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
-  2626666598U,	// <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  3695100067U,	// <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
-  3707044102U,	// <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
-  2726466580U,	// <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
-  3654921933U,	// <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
-  2621358582U,	// <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
-  2622022215U,	// <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
-  2626667165U,	// <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
-  2593128550U,	// <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
-  2626667316U,	// <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
-  3700409238U,	// <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
-  2257294428U,	// <1,7,1,3>: Cost 3 vrev <7,1,3,1>
-  2593131830U,	// <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
-  2626667646U,	// <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
-  2627331279U,	// <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
-  2593133696U,	// <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
-  2628658545U,	// <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
-  2587164774U,	// <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
-  3701073445U,	// <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
-  3700409960U,	// <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
-  2638612134U,	// <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
-  2587168054U,	// <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
-  3706382167U,	// <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
-  2587169192U,	// <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
-  3660911610U,	// <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
-  2587170606U,	// <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
-  1507459174U,	// <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
-  2569257984U,	// <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
-  2581202536U,	// <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
-  2569259294U,	// <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
-  1507462454U,	// <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
-  1507462864U,	// <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
-  2581205498U,	// <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
-  2581206010U,	// <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
-  1507465006U,	// <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
-  2728826164U,	// <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
-  3654951732U,	// <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
-  3330987094U,	// <1,7,4,2>: Cost 4 vrev <7,1,2,4>
-  3331060831U,	// <1,7,4,3>: Cost 4 vrev <7,1,3,4>
-  3787674971U,	// <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
-  2626669878U,	// <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
-  3785979241U,	// <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
-  3787085176U,	// <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
-  2626670121U,	// <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
-  2569273446U,	// <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
-  2569274368U,	// <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
-  3643016808U,	// <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
-  2569275680U,	// <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
-  2569276726U,	// <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
-  4102034790U,	// <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
-  2651222067U,	// <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
-  3899378998U,	// <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
-  2569279278U,	// <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
-  2730153430U,	// <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
-  2724845022U,	// <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
-  3643025338U,	// <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
-  3643025697U,	// <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
-  3643026742U,	// <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
-  3654971091U,	// <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
-  3787675153U,	// <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
-  2724845076U,	// <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
-  2725508637U,	// <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
-  2730817063U,	// <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
-  3631088436U,	// <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
-  3660949158U,	// <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
-  3801904705U,	// <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
-  3631090998U,	// <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
-  2662503828U,	// <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
-  3660951981U,	// <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
-  2713933420U,	// <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
-  2731406959U,	// <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
-  1507500134U,	// <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
-  2626672430U,	// <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  2581243496U,	// <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
-  2569300259U,	// <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
-  1507503414U,	// <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
-  1507503829U,	// <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
-  2581246458U,	// <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
-  2581246970U,	// <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
-  1507505966U,	// <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
-  1543643153U,	// <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
-  1546297446U,	// <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
-  2819448852U,	// <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
-  2619375876U,	// <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
-  1546297685U,	// <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
-  1658771190U,	// <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
-  2736789248U,	// <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
-  2659189376U,	// <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
-  1546298013U,	// <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
-  1483112550U,	// <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-  202162278U,	// <1,u,1,1>: Cost 1 vdup1 LHS
-  1616009006U,	// <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  1745707110U,	// <1,u,1,3>: Cost 2 vuzpr LHS, LHS
-  1483115830U,	// <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2620040336U,	// <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
-  3026622618U,	// <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  2958183752U,	// <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-  202162278U,	// <1,u,1,u>: Cost 1 vdup1 LHS
-  2819449750U,	// <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2893207342U,	// <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
-  2819448996U,	// <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819450482U,	// <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819449754U,	// <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  2893207706U,	// <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  2819449036U,	// <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  2970799432U,	// <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2819449002U,	// <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-  403931292U,	// <1,u,3,0>: Cost 1 vext1 LHS, LHS
-  1477673718U,	// <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  115726126U,	// <1,u,3,2>: Cost 1 vrev LHS
-  2014102173U,	// <1,u,3,3>: Cost 2 vtrnr LHS, LHS
-  403934518U,	// <1,u,3,4>: Cost 1 vext1 LHS, RHS
-  1507536601U,	// <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
-  1525453306U,	// <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  2014105129U,	// <1,u,3,7>: Cost 2 vtrnr LHS, RHS
-  403937070U,	// <1,u,3,u>: Cost 1 vext1 LHS, LHS
-  2620042157U,	// <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
-  2620042237U,	// <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
-  2263217967U,	// <1,u,4,2>: Cost 3 vrev <u,1,2,4>
-  2569341224U,	// <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
-  2569342262U,	// <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
-  1546300726U,	// <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  2819449180U,	// <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
-  2724845649U,	// <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
-  1546300969U,	// <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
-  2551431270U,	// <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
-  2551432192U,	// <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
-  3028293422U,	// <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  2955559068U,	// <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2551434550U,	// <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
-  2895255706U,	// <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616009370U,	// <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710390U,	// <1,u,5,7>: Cost 2 vuzpr LHS, RHS
-  1745710391U,	// <1,u,5,u>: Cost 2 vuzpr LHS, RHS
-  2653221159U,	// <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
-  2725509303U,	// <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
-  2659193338U,	// <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
-  2689751248U,	// <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
-  2867228774U,	// <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
-  3764820194U,	// <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
-  2657202957U,	// <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
-  2819450810U,	// <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819450811U,	// <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  1585452032U,	// <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
-  2557420340U,	// <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
-  2569365158U,	// <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
-  2569365803U,	// <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
-  2557422902U,	// <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
-  2662512021U,	// <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
-  2724845884U,	// <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
-  2659194476U,	// <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
-  1590761096U,	// <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
-  403972257U,	// <1,u,u,0>: Cost 1 vext1 LHS, LHS
-  202162278U,	// <1,u,u,1>: Cost 1 vdup1 LHS
-  115767091U,	// <1,u,u,2>: Cost 1 vrev LHS
-  1745707677U,	// <1,u,u,3>: Cost 2 vuzpr LHS, LHS
-  403975478U,	// <1,u,u,4>: Cost 1 vext1 LHS, RHS
-  1546303642U,	// <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  1616009613U,	// <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710633U,	// <1,u,u,7>: Cost 2 vuzpr LHS, RHS
-  403978030U,	// <1,u,u,u>: Cost 1 vext1 LHS, LHS
-  2551463936U,	// <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
-  2685698058U,	// <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
-  1610776596U,	// <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
-  2619384069U,	// <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
-  2551467318U,	// <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
-  3899836596U,	// <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
-  2621374968U,	// <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
-  4168271334U,	// <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
-  1611219018U,	// <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
-  2551472138U,	// <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
-  2690564186U,	// <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
-  1611956326U,	// <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2826092646U,	// <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
-  2551475510U,	// <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
-  3692463248U,	// <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
-  2587308473U,	// <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
-  3661050874U,	// <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
-  1611956380U,	// <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  1477738598U,	// <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
-  2551481078U,	// <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
-  2551481796U,	// <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
-  2551482518U,	// <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
-  1477741878U,	// <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
-  2551484112U,	// <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
-  2551484759U,	// <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
-  2551485434U,	// <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
-  1477744430U,	// <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
-  2953625600U,	// <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2953627302U,	// <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  2953625764U,	// <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
-  4027369695U,	// <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
-  3625233718U,	// <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
-  3899836110U,	// <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
-  4032012618U,	// <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
-  3899835392U,	// <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
-  2953625770U,	// <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
-  2551496806U,	// <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
-  2685698386U,	// <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
-  2685698396U,	// <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
-  3625240726U,	// <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
-  2551500086U,	// <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
-  2618723638U,	// <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765409590U,	// <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  3799990664U,	// <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
-  2685698450U,	// <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
-  3625246822U,	// <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
-  3289776304U,	// <2,0,5,1>: Cost 4 vrev <0,2,1,5>
-  2690564526U,	// <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
-  3289923778U,	// <2,0,5,3>: Cost 4 vrev <0,2,3,5>
-  2216255691U,	// <2,0,5,4>: Cost 3 vrev <0,2,4,5>
-  3726307332U,	// <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
-  3726307426U,	// <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
-  2826095926U,	// <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  2216550639U,	// <2,0,5,u>: Cost 3 vrev <0,2,u,5>
-  4162420736U,	// <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
-  2901885030U,	// <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
-  2685698559U,	// <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
-  3643173171U,	// <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
-  2216263884U,	// <2,0,6,4>: Cost 3 vrev <0,2,4,6>
-  3730289341U,	// <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
-  3726308152U,	// <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
-  3899836346U,	// <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
-  2216558832U,	// <2,0,6,u>: Cost 3 vrev <0,2,u,6>
-  2659202049U,	// <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  3726308437U,	// <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
-  2726249034U,	// <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
-  3734934772U,	// <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
-  3726308710U,	// <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
-  3726308814U,	// <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
-  3736925671U,	// <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
-  3726308972U,	// <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
-  2659202049U,	// <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  1477787750U,	// <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
-  2953668262U,	// <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  1611956893U,	// <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2551531670U,	// <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
-  1477791030U,	// <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
-  2618726554U,	// <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765412506U,	// <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  2826096169U,	// <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  1611956947U,	// <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  2569453670U,	// <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
-  2619392102U,	// <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
-  3759440619U,	// <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
-  1616823030U,	// <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
-  2569456950U,	// <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
-  2690712328U,	// <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
-  3661115841U,	// <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
-  2622046794U,	// <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
-  1617191715U,	// <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
-  2551545958U,	// <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
-  2685698868U,	// <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
-  2628682646U,	// <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
-  2685698888U,	// <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
-  2551549238U,	// <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
-  3693134992U,	// <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
-  3661124034U,	// <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
-  3625292794U,	// <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
-  2685698933U,	// <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
-  2551554150U,	// <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
-  3893649571U,	// <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
-  2551555688U,	// <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
-  2685698966U,	// <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
-  2551557430U,	// <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
-  3763422123U,	// <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
-  3693135802U,	// <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
-  2726249402U,	// <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
-  2685699011U,	// <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
-  2551562342U,	// <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
-  2953625610U,	// <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953627798U,	// <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  2953626584U,	// <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
-  2551565622U,	// <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
-  2953625938U,	// <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,	// <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  4032013519U,	// <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
-  2953625617U,	// <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
-  2690565154U,	// <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
-  3625313270U,	// <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
-  3771532340U,	// <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
-  1148404634U,	// <2,1,4,3>: Cost 2 vrev <1,2,3,4>
-  3625315638U,	// <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
-  2619395382U,	// <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
-  3837242678U,	// <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
-  3799991394U,	// <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
-  1148773319U,	// <2,1,4,u>: Cost 2 vrev <1,2,u,4>
-  2551578726U,	// <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
-  2551579648U,	// <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
-  3625321952U,	// <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
-  2685699216U,	// <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
-  2551582006U,	// <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
-  3740913668U,	// <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
-  3661156806U,	// <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
-  3893652790U,	// <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
-  2685699261U,	// <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
-  2551586918U,	// <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
-  3625329398U,	// <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
-  2551588794U,	// <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
-  3088679014U,	// <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
-  2551590198U,	// <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
-  4029382994U,	// <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
-  3625333560U,	// <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
-  3731624800U,	// <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
-  2551592750U,	// <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
-  2622051322U,	// <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
-  3733615699U,	// <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
-  3795125538U,	// <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
-  2222171037U,	// <2,1,7,3>: Cost 3 vrev <1,2,3,7>
-  3740915046U,	// <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
-  3296060335U,	// <2,1,7,5>: Cost 4 vrev <1,2,5,7>
-  3736933864U,	// <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
-  3805300055U,	// <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
-  2669827714U,	// <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
-  2551603302U,	// <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
-  2953666570U,	// <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953668758U,	// <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  1148437406U,	// <2,1,u,3>: Cost 2 vrev <1,2,3,u>
-  2551606582U,	// <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
-  2953666898U,	// <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,	// <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  2669828370U,	// <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
-  1148806091U,	// <2,1,u,u>: Cost 2 vrev <1,2,u,u>
-  1543667732U,	// <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
-  1548976230U,	// <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
-  2685699524U,	// <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
-  2685699535U,	// <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
-  2551614774U,	// <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
-  3704422830U,	// <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
-  3893657642U,	// <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
-  3770574323U,	// <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
-  1548976796U,	// <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
-  2622718710U,	// <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
-  2622718772U,	// <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
-  2622718870U,	// <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
-  2819915878U,	// <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
-  3625364790U,	// <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
-  2622719120U,	// <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
-  3760031292U,	// <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
-  3667170468U,	// <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
-  2819915883U,	// <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
-  1489829990U,	// <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  2563572470U,	// <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
-  269271142U,	// <2,2,2,2>: Cost 1 vdup2 LHS
-  2685699698U,	// <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
-  1489833270U,	// <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  2685699720U,	// <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
-  2622719930U,	// <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
-  2593436837U,	// <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-  269271142U,	// <2,2,2,u>: Cost 1 vdup2 LHS
-  2685699750U,	// <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
-  2690565806U,	// <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
-  2953627240U,	// <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
-  1879883878U,	// <2,2,3,3>: Cost 2 vzipr LHS, LHS
-  2685699790U,	// <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
-  3893659342U,	// <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
-  2958270812U,	// <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2593445030U,	// <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
-  1879883883U,	// <2,2,3,u>: Cost 2 vzipr LHS, LHS
-  2551644262U,	// <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
-  3625386742U,	// <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
-  2551645902U,	// <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
-  3759441686U,	// <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
-  2551647542U,	// <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
-  1548979510U,	// <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2764901686U,	// <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
-  3667195047U,	// <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
-  1548979753U,	// <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
-  3696463432U,	// <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
-  2617413328U,	// <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
-  2685699936U,	// <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
-  4027383910U,	// <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
-  2228201085U,	// <2,2,5,4>: Cost 3 vrev <2,2,4,5>
-  2617413636U,	// <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
-  2617413730U,	// <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
-  2819919158U,	// <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-  2819919159U,	// <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
-  3625402554U,	// <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
-  3760031652U,	// <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
-  2617414138U,	// <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
-  2685700026U,	// <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
-  3625405750U,	// <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
-  3760031692U,	// <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
-  3088679116U,	// <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
-  2657891169U,	// <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
-  2685700071U,	// <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
-  2726250474U,	// <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
-  3704427616U,	// <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
-  2660545701U,	// <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
-  4030718054U,	// <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
-  2617415014U,	// <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
-  3302033032U,	// <2,2,7,5>: Cost 4 vrev <2,2,5,7>
-  3661246929U,	// <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
-  2617415276U,	// <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
-  2731558962U,	// <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
-  1489829990U,	// <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  1548982062U,	// <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
-  269271142U,	// <2,2,u,2>: Cost 1 vdup2 LHS
-  1879924838U,	// <2,2,u,3>: Cost 2 vzipr LHS, LHS
-  1489833270U,	// <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  1548982426U,	// <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2953666908U,	// <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2819919401U,	// <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-  269271142U,	// <2,2,u,u>: Cost 1 vdup2 LHS
-  1544339456U,	// <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-  470597734U,	// <2,3,0,1>: Cost 1 vext2 LHS, LHS
-  1548984484U,	// <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2619408648U,	// <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
-  1548984658U,	// <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665857454U,	// <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  2622726655U,	// <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2593494188U,	// <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
-  470598301U,	// <2,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544340214U,	// <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544340276U,	// <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544340374U,	// <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1548985304U,	// <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2551696694U,	// <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
-  1548985488U,	// <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2622727375U,	// <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
-  2665858347U,	// <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
-  1548985709U,	// <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  2622727613U,	// <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
-  2622727711U,	// <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-  1544341096U,	// <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544341158U,	// <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  2622727958U,	// <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
-  2622728032U,	// <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
-  1548986298U,	// <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2665859050U,	// <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
-  1548986427U,	// <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1548986518U,	// <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2622728415U,	// <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
-  1489913458U,	// <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
-  1544341916U,	// <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
-  1548986882U,	// <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2665859632U,	// <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
-  2234304870U,	// <2,3,3,6>: Cost 3 vrev <3,2,6,3>
-  2958271632U,	// <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-  1548987166U,	// <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
-  1483948134U,	// <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
-  1483948954U,	// <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
-  2622729276U,	// <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2557692054U,	// <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
-  1483951414U,	// <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
-  470601014U,	// <2,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592118644U,	// <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2593526960U,	// <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
-  470601257U,	// <2,3,4,u>: Cost 1 vext2 LHS, RHS
-  2551726182U,	// <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
-  1592118992U,	// <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2665860862U,	// <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
-  2551728642U,	// <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
-  1592119238U,	// <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592119300U,	// <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592119394U,	// <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1592119464U,	// <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1592119545U,	// <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
-  2622730529U,	// <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2557707164U,	// <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
-  1592119802U,	// <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2665861682U,	// <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
-  2622730893U,	// <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  2665861810U,	// <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
-  1592120120U,	// <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592120142U,	// <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1592120223U,	// <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
-  1592120314U,	// <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659890261U,	// <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
-  2660553894U,	// <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
-  2665862371U,	// <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592120678U,	// <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665862534U,	// <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2665862614U,	// <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
-  1592120940U,	// <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592120962U,	// <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1548990163U,	// <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-  470603566U,	// <2,3,u,1>: Cost 1 vext2 LHS, LHS
-  1548990341U,	// <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-  1548990396U,	// <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
-  1548990527U,	// <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-  470603930U,	// <2,3,u,5>: Cost 1 vext2 LHS, RHS
-  1548990672U,	// <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1592121600U,	// <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
-  470604133U,	// <2,3,u,u>: Cost 1 vext2 LHS, LHS
-  2617425942U,	// <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
-  2618753126U,	// <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2618753208U,	// <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
-  2619416841U,	// <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
-  2587593628U,	// <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
-  2712832914U,	// <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
-  1634962332U,	// <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  3799993252U,	// <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
-  1634962332U,	// <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  2619417334U,	// <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
-  3692495668U,	// <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
-  2625389466U,	// <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
-  2826125414U,	// <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
-  3699794995U,	// <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
-  3692496016U,	// <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
-  3763424238U,	// <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
-  3667317942U,	// <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
-  2826125419U,	// <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
-  2629371336U,	// <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
-  3699131946U,	// <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
-  2630698602U,	// <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
-  2618754766U,	// <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
-  2826126234U,	// <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
-  2899119414U,	// <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
-  3033337142U,	// <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
-  3800214597U,	// <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
-  2899119657U,	// <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
-  2635344033U,	// <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
-  4032012325U,	// <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
-  3692497228U,	// <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
-  3692497308U,	// <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
-  3001404624U,	// <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
-  2953627342U,	// <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2953625804U,	// <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3899868160U,	// <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
-  2953625806U,	// <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  2710916266U,	// <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
-  3899869648U,	// <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
-  3899869658U,	// <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
-  3899868930U,	// <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
-  2712833232U,	// <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
-  2618756406U,	// <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
-  2765737270U,	// <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
-  4168304426U,	// <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
-  2618756649U,	// <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
-  2551800011U,	// <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
-  2569716470U,	// <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
-  2563745405U,	// <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
-  2569718102U,	// <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
-  2551803190U,	// <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
-  3625545732U,	// <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
-  1611959606U,	// <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128694U,	// <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959624U,	// <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478066278U,	// <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
-  2551808758U,	// <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
-  2551809516U,	// <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
-  2551810198U,	// <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
-  1478069558U,	// <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
-  2901888310U,	// <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  2551812920U,	// <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
-  2726251914U,	// <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
-  1478072110U,	// <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
-  2659234821U,	// <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  3786722726U,	// <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
-  3734303911U,	// <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
-  3734967544U,	// <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
-  3727005030U,	// <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
-  2726251976U,	// <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
-  2726251986U,	// <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
-  3727005292U,	// <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
-  2659234821U,	// <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  1478082662U,	// <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
-  2618758958U,	// <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2551826024U,	// <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
-  2551826582U,	// <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
-  1478085942U,	// <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
-  2953668302U,	// <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  1611959849U,	// <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128937U,	// <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959867U,	// <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  3691839488U,	// <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
-  2618097766U,	// <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620088484U,	// <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
-  2619425034U,	// <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
-  2620088667U,	// <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
-  2620752300U,	// <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
-  3693830655U,	// <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
-  3094531382U,	// <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-  2618098333U,	// <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  3691840246U,	// <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
-  3691840308U,	// <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
-  2626061206U,	// <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
-  2618098688U,	// <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
-  2626061364U,	// <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
-  3691840656U,	// <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
-  3789082310U,	// <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
-  2712833744U,	// <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
-  2628715896U,	// <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
-  3693831613U,	// <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
-  4026698642U,	// <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
-  2632033896U,	// <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
-  3691841190U,	// <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
-  2632034061U,	// <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
-  3691841352U,	// <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
-  3691841466U,	// <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
-  3088354614U,	// <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-  3088354615U,	// <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
-  2557829222U,	// <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
-  2557830059U,	// <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
-  2575746766U,	// <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
-  3691841948U,	// <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
-  2619427330U,	// <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
-  2581720847U,	// <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
-  2953628162U,	// <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953626624U,	// <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2953626625U,	// <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
-  2569781350U,	// <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
-  3631580076U,	// <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
-  2569782990U,	// <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
-  2569783646U,	// <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
-  2569784630U,	// <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
-  2618101046U,	// <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  3893905922U,	// <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
-  3094564150U,	// <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-  2618101289U,	// <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
-  2551873638U,	// <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
-  3637560320U,	// <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
-  3637560966U,	// <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
-  3723030343U,	// <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
-  2551876918U,	// <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
-  2712834052U,	// <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
-  4028713474U,	// <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
-  2712834072U,	// <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
-  2712834081U,	// <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
-  2575769702U,	// <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
-  3631596462U,	// <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
-  2655924730U,	// <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
-  3643541856U,	// <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
-  2655924849U,	// <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
-  3787755607U,	// <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
-  4029385218U,	// <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
-  3088682294U,	// <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
-  3088682295U,	// <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
-  2563833958U,	// <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
-  2551890678U,	// <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
-  2563835528U,	// <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
-  3637577878U,	// <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
-  2563837238U,	// <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
-  2712834216U,	// <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
-  2712834220U,	// <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
-  4174449974U,	// <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
-  2563839790U,	// <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
-  2563842150U,	// <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
-  2618103598U,	// <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2563843721U,	// <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
-  2569816418U,	// <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
-  2622748735U,	// <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
-  2618103962U,	// <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  2953669122U,	// <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953667584U,	// <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2618104165U,	// <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620096512U,	// <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
-  1546354790U,	// <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620096676U,	// <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
-  3693838588U,	// <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
-  1546355036U,	// <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
-  3694502317U,	// <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
-  2551911246U,	// <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
-  2720723287U,	// <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
-  1546355357U,	// <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620097270U,	// <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
-  2620097332U,	// <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
-  2620097430U,	// <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
-  2820243558U,	// <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2620097598U,	// <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
-  2620097680U,	// <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
-  3693839585U,	// <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
-  2721386920U,	// <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
-  2820243563U,	// <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2714014137U,	// <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
-  2712834500U,	// <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
-  2620098152U,	// <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
-  2620098214U,	// <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
-  2632042254U,	// <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
-  2712834540U,	// <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
-  2820243660U,	// <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
-  2958265654U,	// <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
-  2620098619U,	// <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
-  2620098710U,	// <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
-  3893986982U,	// <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
-  2569848762U,	// <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
-  2620098972U,	// <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
-  2620099074U,	// <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
-  3893987022U,	// <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
-  3001404644U,	// <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1879887158U,	// <2,6,3,7>: Cost 2 vzipr LHS, RHS
-  1879887159U,	// <2,6,3,u>: Cost 2 vzipr LHS, RHS
-  2620099484U,	// <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
-  2620099566U,	// <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
-  2620099644U,	// <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
-  3643599207U,	// <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
-  2575830080U,	// <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
-  1546358070U,	// <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2667875700U,	// <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
-  4028042550U,	// <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
-  1546358313U,	// <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
-  3693841992U,	// <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
-  2667876048U,	// <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
-  2712834756U,	// <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
-  3643607400U,	// <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
-  2252091873U,	// <2,6,5,4>: Cost 3 vrev <6,2,4,5>
-  2667876356U,	// <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
-  2667876450U,	// <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
-  2820246838U,	// <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2820246839U,	// <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2563899494U,	// <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
-  3893988683U,	// <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
-  2563901072U,	// <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
-  3893987236U,	// <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
-  2563902774U,	// <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
-  3893988723U,	// <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
-  2712834872U,	// <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
-  2955644214U,	// <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
-  2955644215U,	// <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
-  2712834894U,	// <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
-  2724926296U,	// <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
-  2725000033U,	// <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
-  2702365544U,	// <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
-  2712834934U,	// <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
-  3776107393U,	// <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
-  2725294981U,	// <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
-  2726253452U,	// <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
-  2712834966U,	// <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
-  2620102355U,	// <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
-  1546360622U,	// <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620102536U,	// <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
-  2820244125U,	// <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  1594136612U,	// <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
-  1546360986U,	// <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2620102864U,	// <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
-  1879928118U,	// <2,6,u,7>: Cost 2 vzipr LHS, RHS
-  1879928119U,	// <2,6,u,u>: Cost 2 vzipr LHS, RHS
-  2726179825U,	// <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
-  1652511738U,	// <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
-  2621431972U,	// <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
-  2257949868U,	// <2,7,0,3>: Cost 3 vrev <7,2,3,0>
-  2726474773U,	// <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
-  2620768686U,	// <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
-  2621432319U,	// <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
-  2599760953U,	// <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
-  1653027897U,	// <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
-  2639348470U,	// <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695174452U,	// <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
-  3695174550U,	// <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
-  3694511104U,	// <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
-  3713090594U,	// <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
-  3693184144U,	// <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
-  2627405016U,	// <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
-  3799995519U,	// <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
-  2639348470U,	// <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695175101U,	// <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
-  3643655168U,	// <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
-  2257892517U,	// <2,7,2,2>: Cost 3 vrev <7,2,2,2>
-  3695175334U,	// <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
-  3695175465U,	// <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
-  2632714080U,	// <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
-  2633377713U,	// <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
-  3695175658U,	// <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
-  2634704979U,	// <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
-  1514094694U,	// <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
-  2569921680U,	// <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
-  2587838056U,	// <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
-  2569922927U,	// <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
-  1514097974U,	// <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
-  2581868321U,	// <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
-  1514099194U,	// <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
-  2587841530U,	// <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
-  1514100526U,	// <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
-  2708706617U,	// <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
-  3649643418U,	// <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
-  3649644330U,	// <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
-  2257982640U,	// <2,7,4,3>: Cost 3 vrev <7,2,3,4>
-  3649645641U,	// <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
-  2621435190U,	// <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  2712835441U,	// <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
-  3799995762U,	// <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
-  2621435433U,	// <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
-  2729497990U,	// <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
-  3643679744U,	// <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
-  3637708424U,	// <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
-  3643681137U,	// <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
-  2599800118U,	// <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
-  3786577334U,	// <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
-  3786577345U,	// <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
-  2599802214U,	// <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
-  2599802670U,	// <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
-  2581889126U,	// <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
-  3643687936U,	// <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
-  2663240186U,	// <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
-  3643689330U,	// <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
-  2581892406U,	// <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
-  2581892900U,	// <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
-  2587865597U,	// <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
-  3786577428U,	// <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
-  2581894958U,	// <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
-  2726254119U,	// <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
-  3804640817U,	// <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
-  3637724826U,	// <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
-  3734992123U,	// <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
-  2552040758U,	// <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
-  3799995992U,	// <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
-  2663241198U,	// <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
-  2712835692U,	// <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
-  2731562607U,	// <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
-  1514135654U,	// <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
-  1657820802U,	// <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
-  2587879016U,	// <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
-  2569963892U,	// <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
-  1514138934U,	// <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
-  2621438106U,	// <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  1514140159U,	// <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
-  2587882490U,	// <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
-  1514141486U,	// <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
-  1544380416U,	// <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-  470638699U,	// <2,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544380580U,	// <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1658631909U,	// <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
-  1544380754U,	// <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665898414U,	// <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  1658853120U,	// <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
-  3094531625U,	// <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-  470639261U,	// <2,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544381174U,	// <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544381236U,	// <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544381334U,	// <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544381400U,	// <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618123325U,	// <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544381584U,	// <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618123489U,	// <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2726254427U,	// <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
-  1544381823U,	// <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
-  1478328422U,	// <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
-  2618123807U,	// <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-  269271142U,	// <2,u,2,2>: Cost 1 vdup2 LHS
-  1544382118U,	// <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1478331702U,	// <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
-  2618124136U,	// <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544382394U,	// <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  3088354857U,	// <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-  269271142U,	// <2,u,2,u>: Cost 1 vdup2 LHS
-  1544382614U,	// <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2953627374U,	// <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
-  1490282143U,	// <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
-  1879883932U,	// <2,u,3,3>: Cost 2 vzipr LHS, LHS
-  1544382978U,	// <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2953627378U,	// <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
-  1514172931U,	// <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
-  1879887176U,	// <2,u,3,7>: Cost 2 vzipr LHS, RHS
-  1879883937U,	// <2,u,3,u>: Cost 2 vzipr LHS, LHS
-  1484316774U,	// <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
-  1484317639U,	// <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
-  2552088270U,	// <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
-  1190213513U,	// <2,u,4,3>: Cost 2 vrev <u,2,3,4>
-  1484320054U,	// <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
-  470641974U,	// <2,u,4,5>: Cost 1 vext2 LHS, RHS
-  1592159604U,	// <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  3094564393U,	// <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-  470642217U,	// <2,u,4,u>: Cost 1 vext2 LHS, RHS
-  2552094959U,	// <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
-  1592159952U,	// <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2564040353U,	// <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
-  2690275455U,	// <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
-  1592160198U,	// <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592160260U,	// <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1611962522U,	// <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1592160424U,	// <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1611962540U,	// <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478361190U,	// <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
-  2552103670U,	// <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
-  1592160762U,	// <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2685704400U,	// <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
-  1478364470U,	// <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
-  2901891226U,	// <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  1592161080U,	// <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592161102U,	// <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1478367022U,	// <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
-  1592161274U,	// <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659931226U,	// <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
-  2564056739U,	// <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
-  2665903331U,	// <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592161638U,	// <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665903494U,	// <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2587947527U,	// <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
-  1592161900U,	// <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592161922U,	// <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1478377574U,	// <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
-  470644526U,	// <2,u,u,1>: Cost 1 vext2 LHS, LHS
-  269271142U,	// <2,u,u,2>: Cost 1 vdup2 LHS
-  1879924892U,	// <2,u,u,3>: Cost 2 vzipr LHS, LHS
-  1478380854U,	// <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
-  470644890U,	// <2,u,u,5>: Cost 1 vext2 LHS, RHS
-  1611962765U,	// <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1879928136U,	// <2,u,u,7>: Cost 2 vzipr LHS, RHS
-  470645093U,	// <2,u,u,u>: Cost 1 vext2 LHS, LHS
-  1611448320U,	// <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611890698U,	// <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611890708U,	// <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  3763576860U,	// <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
-  2689835045U,	// <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
-  3698508206U,	// <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
-  3763576887U,	// <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
-  3667678434U,	// <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
-  1616093258U,	// <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
-  1490337894U,	// <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
-  2685632602U,	// <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
-  537706598U,	// <3,0,1,2>: Cost 1 vext3 LHS, LHS
-  2624766936U,	// <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
-  1490341174U,	// <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
-  2624767120U,	// <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
-  2732966030U,	// <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
-  2593944803U,	// <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
-  537706652U,	// <3,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,	// <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685632684U,	// <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  2685632692U,	// <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
-  2685632702U,	// <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611890892U,	// <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2732966102U,	// <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
-  2624767930U,	// <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
-  2685632744U,	// <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
-  1611890924U,	// <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2624768150U,	// <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
-  2685632764U,	// <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-  2685632774U,	// <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
-  2624768412U,	// <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
-  2624768514U,	// <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
-  3702491714U,	// <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
-  2624768632U,	// <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
-  3702491843U,	// <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
-  2686959934U,	// <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
-  2689835336U,	// <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
-  1611891026U,	// <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611891036U,	// <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3763577184U,	// <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
-  2689835374U,	// <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
-  1551027510U,	// <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2666573172U,	// <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
-  3667711206U,	// <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
-  1616093586U,	// <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2685190556U,	// <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
-  2666573520U,	// <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
-  3040886886U,	// <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
-  3625912834U,	// <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
-  2666573766U,	// <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
-  2666573828U,	// <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
-  2732966354U,	// <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
-  2666573992U,	// <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
-  3040886940U,	// <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
-  2685190637U,	// <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
-  2732966390U,	// <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
-  2689835519U,	// <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
-  3667724438U,	// <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
-  3763577355U,	// <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
-  3806708243U,	// <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
-  2666574648U,	// <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
-  2657948520U,	// <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
-  2689835573U,	// <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
-  2666574842U,	// <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
-  2685633095U,	// <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
-  2660603052U,	// <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
-  3643844997U,	// <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
-  2666575206U,	// <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
-  3655790391U,	// <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
-  3731690968U,	// <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
-  2666575468U,	// <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
-  2664584850U,	// <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
-  1616093834U,	// <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
-  1611891346U,	// <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-  537707165U,	// <3,0,u,2>: Cost 1 vext3 LHS, LHS
-  2689835684U,	// <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1616093874U,	// <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551030426U,	// <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2624772304U,	// <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
-  2594002154U,	// <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
-  537707219U,	// <3,0,u,u>: Cost 1 vext3 LHS, LHS
-  2552201318U,	// <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
-  2618802278U,	// <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
-  2618802366U,	// <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
-  1611449078U,	// <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2552204598U,	// <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
-  2732966663U,	// <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
-  3906258396U,	// <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
-  3667752171U,	// <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
-  1611891491U,	// <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  2689835819U,	// <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
-  1611449140U,	// <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
-  2624775063U,	// <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
-  1611891528U,	// <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  2689835859U,	// <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
-  2689835868U,	// <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  3763577701U,	// <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
-  3765273452U,	// <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
-  1611891573U,	// <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
-  2629420494U,	// <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
-  2689835911U,	// <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2564163248U,	// <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
-  1611449238U,	// <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
-  2564164918U,	// <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
-  2689835947U,	// <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  3692545978U,	// <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
-  2732966842U,	// <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
-  1611891651U,	// <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
-  1484456038U,	// <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
-  1611891672U,	// <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685633502U,	// <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2685633512U,	// <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
-  1484459318U,	// <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
-  1611891712U,	// <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2689836041U,	// <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2733409294U,	// <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
-  1611891735U,	// <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  2552234086U,	// <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
-  2732966955U,	// <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
-  2732966964U,	// <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
-  2685633597U,	// <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
-  2552237366U,	// <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
-  2618805558U,	// <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
-  2769472822U,	// <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
-  3667784943U,	// <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
-  2685633642U,	// <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
-  2689836143U,	// <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
-  2564187280U,	// <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
-  2564187827U,	// <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
-  1611891856U,	// <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  2689836183U,	// <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
-  3759375522U,	// <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
-  3720417378U,	// <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
-  2832518454U,	// <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611891901U,	// <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  3763578048U,	// <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
-  2689836239U,	// <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2732967128U,	// <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
-  2685633761U,	// <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  3763578088U,	// <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
-  2689836275U,	// <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  3763578108U,	// <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
-  2732967166U,	// <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
-  2685633806U,	// <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
-  3631972454U,	// <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
-  2659947612U,	// <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
-  4036102294U,	// <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
-  3095396454U,	// <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  3631975734U,	// <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
-  2222982144U,	// <3,1,7,5>: Cost 3 vrev <1,3,5,7>
-  3296797705U,	// <3,1,7,6>: Cost 4 vrev <1,3,6,7>
-  3720418924U,	// <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
-  3095396459U,	// <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1484496998U,	// <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
-  1611892077U,	// <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
-  2685633907U,	// <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-  1611892092U,	// <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
-  1484500278U,	// <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
-  1611892117U,	// <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685633950U,	// <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  2832518697U,	// <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611892140U,	// <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
-  2623455232U,	// <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
-  1549713510U,	// <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  2689836484U,	// <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
-  2685633997U,	// <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2623455570U,	// <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
-  2732967398U,	// <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
-  2689836524U,	// <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2229044964U,	// <3,2,0,7>: Cost 3 vrev <2,3,7,0>
-  1549714077U,	// <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
-  1549714166U,	// <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
-  2623456052U,	// <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
-  2623456150U,	// <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
-  2685634079U,	// <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2552286518U,	// <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
-  2623456400U,	// <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
-  2689836604U,	// <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  3667834101U,	// <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
-  1155385070U,	// <3,2,1,u>: Cost 2 vrev <2,3,u,1>
-  2689836629U,	// <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
-  2689836640U,	// <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
-  1611449960U,	// <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892338U,	// <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  2689836669U,	// <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
-  2689836680U,	// <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2689836688U,	// <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
-  3763578518U,	// <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
-  1611892383U,	// <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
-  1611450022U,	// <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
-  2685191854U,	// <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
-  2685191865U,	// <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
-  2685191875U,	// <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
-  1611450062U,	// <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
-  2732967635U,	// <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
-  2732967645U,	// <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
-  2732967652U,	// <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
-  1611450094U,	// <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
-  2558279782U,	// <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
-  2558280602U,	// <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
-  2732967692U,	// <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
-  2685634326U,	// <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2558283062U,	// <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
-  1549716790U,	// <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689836844U,	// <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
-  2229077736U,	// <3,2,4,7>: Cost 3 vrev <2,3,7,4>
-  1549717033U,	// <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
-  2552316006U,	// <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
-  2228643507U,	// <3,2,5,1>: Cost 3 vrev <2,3,1,5>
-  2689836896U,	// <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685634408U,	// <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1155122894U,	// <3,2,5,4>: Cost 2 vrev <2,3,4,5>
-  2665263108U,	// <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
-  2689836932U,	// <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2665263272U,	// <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
-  1155417842U,	// <3,2,5,u>: Cost 2 vrev <2,3,u,5>
-  2689836953U,	// <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
-  2689836964U,	// <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
-  2689836976U,	// <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
-  1611892666U,	// <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  2689836993U,	// <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
-  2689837004U,	// <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689837013U,	// <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2665263950U,	// <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
-  1611892711U,	// <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  2665264122U,	// <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
-  2623460419U,	// <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
-  4169138340U,	// <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
-  2962358374U,	// <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
-  2665264486U,	// <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
-  2228954841U,	// <3,2,7,5>: Cost 3 vrev <2,3,5,7>
-  2229028578U,	// <3,2,7,6>: Cost 3 vrev <2,3,6,7>
-  2665264748U,	// <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
-  2962358379U,	// <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
-  1611892795U,	// <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
-  1549719342U,	// <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  1611449960U,	// <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892824U,	// <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-  1611892835U,	// <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
-  1549719706U,	// <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689837168U,	// <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
-  2665265408U,	// <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
-  1611892867U,	// <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
-  2685192331U,	// <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
-  1611450518U,	// <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
-  2685634717U,	// <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
-  2564294806U,	// <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
-  2685634736U,	// <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
-  2732968122U,	// <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
-  3763579075U,	// <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
-  4034053264U,	// <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
-  1611450581U,	// <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
-  2685192415U,	// <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
-  1550385992U,	// <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
-  2685192433U,	// <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
-  2685634808U,	// <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
-  2558332214U,	// <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
-  2685634828U,	// <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
-  3759376661U,	// <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
-  2703477022U,	// <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
-  1555031423U,	// <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
-  2564309094U,	// <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
-  2630100513U,	// <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
-  1557022322U,	// <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
-  2685192520U,	// <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
-  2564312374U,	// <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
-  2732968286U,	// <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
-  2685634918U,	// <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
-  2704140655U,	// <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
-  1561004120U,	// <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
-  1496547430U,	// <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  2624129256U,	// <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
-  2630764866U,	// <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
-  336380006U,	// <3,3,3,3>: Cost 1 vdup3 LHS
-  1496550710U,	// <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  2732968368U,	// <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
-  2624129683U,	// <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
-  2594182400U,	// <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
-  336380006U,	// <3,3,3,u>: Cost 1 vdup3 LHS
-  2558353510U,	// <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
-  2558354411U,	// <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
-  2564327108U,	// <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
-  2564327938U,	// <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
-  2960343962U,	// <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
-  1611893250U,	// <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
-  2771619126U,	// <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
-  4034086032U,	// <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
-  1611893277U,	// <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
-  2558361702U,	// <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
-  2558362604U,	// <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
-  2558363342U,	// <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
-  2732968512U,	// <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
-  2558364982U,	// <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
-  3101279950U,	// <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
-  2665934946U,	// <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
-  2826636598U,	// <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2826636599U,	// <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2732968568U,	// <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
-  3763579521U,	// <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
-  2732968586U,	// <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
-  2732968595U,	// <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
-  2732968604U,	// <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
-  3763579557U,	// <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
-  2732968621U,	// <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
-  2657973099U,	// <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
-  2658636732U,	// <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
-  2558378086U,	// <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
-  2558378990U,	// <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
-  2564351687U,	// <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
-  2661291264U,	// <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
-  2558381366U,	// <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
-  2732968694U,	// <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
-  3781126907U,	// <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
-  3095397376U,	// <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
-  2558383918U,	// <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
-  1496547430U,	// <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  1611893534U,	// <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
-  1592858504U,	// <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
-  336380006U,	// <3,3,u,3>: Cost 1 vdup3 LHS
-  1496550710U,	// <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  1611893574U,	// <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
-  2690280268U,	// <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
-  2826636841U,	// <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-  336380006U,	// <3,3,u,u>: Cost 1 vdup3 LHS
-  2624798720U,	// <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
-  1551056998U,	// <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624798884U,	// <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
-  3693232384U,	// <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
-  2624799058U,	// <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
-  1659227026U,	// <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
-  1659227036U,	// <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
-  3667973382U,	// <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
-  1551057565U,	// <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624799478U,	// <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
-  2624799540U,	// <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
-  1551057818U,	// <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
-  2624799704U,	// <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
-  2564377910U,	// <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
-  2689838050U,	// <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
-  2689838062U,	// <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2628117807U,	// <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
-  1555039616U,	// <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
-  3626180710U,	// <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
-  2624800298U,	// <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
-  2624800360U,	// <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
-  2624800422U,	// <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
-  2624800514U,	// <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
-  2709965878U,	// <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
-  2689838140U,	// <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
-  2634090504U,	// <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
-  2689838158U,	// <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
-  2624800918U,	// <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
-  2636081403U,	// <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
-  2636745036U,	// <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
-  2624801180U,	// <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
-  2624801232U,	// <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
-  2905836854U,	// <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
-  3040054582U,	// <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
-  3702524611U,	// <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
-  2624801566U,	// <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
-  2564399206U,	// <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
-  2564400026U,	// <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
-  2564400845U,	// <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
-  2570373542U,	// <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-  1659227344U,	// <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1551060278U,	// <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
-  1659227364U,	// <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
-  3668006154U,	// <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
-  1551060521U,	// <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
-  1490665574U,	// <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
-  2689838341U,	// <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1490667214U,	// <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
-  2564409494U,	// <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
-  1490668854U,	// <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
-  2689838381U,	// <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
-  537709878U,	// <3,4,5,6>: Cost 1 vext3 LHS, RHS
-  2594272523U,	// <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
-  537709896U,	// <3,4,5,u>: Cost 1 vext3 LHS, RHS
-  2689838411U,	// <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
-  2558444534U,	// <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
-  2666607098U,	// <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
-  2558446082U,	// <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
-  1659227508U,	// <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2689838462U,	// <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  2689838471U,	// <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
-  2657981292U,	// <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
-  1659227540U,	// <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
-  2666607610U,	// <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
-  3702527072U,	// <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
-  2660635824U,	// <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
-  3644139945U,	// <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
-  2666607974U,	// <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
-  2732969416U,	// <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
-  2732969425U,	// <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
-  2666608236U,	// <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
-  2664617622U,	// <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
-  1490690150U,	// <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
-  1551062830U,	// <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  1490691793U,	// <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
-  2624804796U,	// <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
-  1490693430U,	// <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
-  1551063194U,	// <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
-  537710121U,	// <3,4,u,6>: Cost 1 vext3 LHS, RHS
-  2594297102U,	// <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
-  537710139U,	// <3,4,u,u>: Cost 1 vext3 LHS, RHS
-  3692576768U,	// <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
-  2618835046U,	// <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
-  2618835138U,	// <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
-  3692577024U,	// <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
-  2689838690U,	// <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
-  2732969579U,	// <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2732969588U,	// <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
-  2246963055U,	// <3,5,0,7>: Cost 3 vrev <5,3,7,0>
-  2618835613U,	// <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
-  2594308198U,	// <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
-  3692577588U,	// <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
-  2624807835U,	// <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
-  2625471468U,	// <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
-  2626135101U,	// <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
-  2594311888U,	// <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
-  3699877107U,	// <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
-  1641680592U,	// <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
-  1641754329U,	// <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
-  3692578274U,	// <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
-  2630116899U,	// <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
-  3692578408U,	// <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
-  2625472206U,	// <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
-  2632107798U,	// <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
-  2715938575U,	// <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
-  3692578746U,	// <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
-  2716086049U,	// <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
-  2634762330U,	// <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
-  3692578966U,	// <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
-  2636089596U,	// <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
-  3699214668U,	// <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
-  2638080412U,	// <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
-  2618837506U,	// <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
-  2832844494U,	// <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
-  4033415682U,	// <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
-  3095072054U,	// <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
-  3095072055U,	// <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
-  2600304742U,	// <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
-  3763580815U,	// <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
-  2564474582U,	// <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
-  3699879044U,	// <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
-  2600308022U,	// <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
-  2618838326U,	// <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
-  2772454710U,	// <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1659228102U,	// <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
-  1659228111U,	// <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
-  2570453094U,	// <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
-  2624810704U,	// <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
-  2570454734U,	// <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
-  2570455472U,	// <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
-  2570456374U,	// <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
-  1659228164U,	// <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-  2732969998U,	// <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
-  1659228184U,	// <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
-  1659228193U,	// <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
-  2732970020U,	// <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
-  2732970035U,	// <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
-  2564490968U,	// <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
-  2732970050U,	// <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
-  2732970060U,	// <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
-  2732970071U,	// <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
-  2732970080U,	// <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
-  1659228258U,	// <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
-  1659228267U,	// <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
-  1484783718U,	// <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484784640U,	// <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
-  2558527080U,	// <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
-  2558527638U,	// <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
-  1484786998U,	// <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
-  1659228328U,	// <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2732970154U,	// <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
-  2558531180U,	// <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
-  1484789550U,	// <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484791910U,	// <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
-  1484792833U,	// <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
-  2558535272U,	// <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
-  2558535830U,	// <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
-  1484795190U,	// <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
-  1659228409U,	// <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
-  2772457626U,	// <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1646326023U,	// <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
-  1484797742U,	// <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
-  2558541926U,	// <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
-  2689839393U,	// <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
-  2689839404U,	// <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
-  3706519808U,	// <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
-  2689839420U,	// <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
-  2732970314U,	// <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
-  2732970316U,	// <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
-  2960313654U,	// <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  2689839456U,	// <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
-  3763581290U,	// <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
-  3763581297U,	// <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
-  2624816028U,	// <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
-  3763581315U,	// <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
-  2626143294U,	// <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
-  3763581335U,	// <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
-  2721321376U,	// <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
-  2721395113U,	// <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
-  2628797826U,	// <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
-  2594390118U,	// <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
-  2721616324U,	// <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
-  2630788725U,	// <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
-  3763581395U,	// <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
-  2632115991U,	// <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
-  2632779624U,	// <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
-  2594394618U,	// <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
-  1648316922U,	// <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
-  1648390659U,	// <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
-  3693914262U,	// <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
-  3638281176U,	// <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
-  3696568678U,	// <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
-  2638088604U,	// <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
-  2632780290U,	// <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
-  3712494145U,	// <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
-  3698559612U,	// <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
-  2959674678U,	// <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-  2959674679U,	// <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
-  3763581536U,	// <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
-  2722943590U,	// <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
-  2732970609U,	// <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
-  3698560147U,	// <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
-  2732970628U,	// <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
-  2689839757U,	// <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
-  2732970640U,	// <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
-  2960346422U,	// <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
-  2689839784U,	// <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
-  2576498790U,	// <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
-  3650241270U,	// <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
-  2732970692U,	// <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
-  2576501250U,	// <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  2576501906U,	// <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
-  3650244622U,	// <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
-  4114633528U,	// <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
-  2732970735U,	// <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
-  2576504622U,	// <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
-  2732970749U,	// <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
-  2724270856U,	// <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
-  2624819706U,	// <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
-  3656223234U,	// <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
-  2732970788U,	// <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
-  2732970800U,	// <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
-  1659228984U,	// <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659228994U,	// <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
-  1659229003U,	// <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
-  1659229006U,	// <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
-  2558600201U,	// <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
-  2558601146U,	// <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
-  2725081963U,	// <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
-  1659229046U,	// <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
-  2715423611U,	// <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
-  2722059141U,	// <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
-  2962361654U,	// <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
-  1659229078U,	// <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
-  1659229087U,	// <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
-  2689840041U,	// <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
-  2558609339U,	// <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
-  2576525853U,	// <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
-  1659229127U,	// <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
-  2689840081U,	// <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
-  1659228984U,	// <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1652298720U,	// <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
-  1659229159U,	// <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
-  2626813952U,	// <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
-  1553072230U,	// <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814116U,	// <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
-  3700556028U,	// <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
-  2626814290U,	// <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
-  2582507375U,	// <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
-  2588480072U,	// <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
-  2732971055U,	// <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
-  1553072797U,	// <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814710U,	// <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
-  2626814772U,	// <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
-  2626814870U,	// <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
-  2625487854U,	// <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
-  2582514998U,	// <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
-  1553073296U,	// <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
-  2627478753U,	// <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
-  2727367810U,	// <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
-  1555064195U,	// <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
-  2588491878U,	// <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
-  3700557318U,	// <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
-  2626815592U,	// <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
-  2626815654U,	// <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
-  2588495158U,	// <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
-  2632787817U,	// <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
-  1559709626U,	// <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
-  2728031443U,	// <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
-  1561036892U,	// <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
-  2626816150U,	// <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
-  2626816268U,	// <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
-  2633451878U,	// <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
-  2626816412U,	// <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
-  2626816514U,	// <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
-  2638760514U,	// <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
-  2639424147U,	// <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
-  2826961920U,	// <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
-  2626816798U,	// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
-  2582536294U,	// <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
-  2582537360U,	// <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
-  2588510138U,	// <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
-  3700558996U,	// <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
-  2582539574U,	// <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
-  1553075510U,	// <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  2588512844U,	// <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
-  2564625766U,	// <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
-  1553075753U,	// <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
-  2732971398U,	// <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
-  2626817744U,	// <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
-  3700559649U,	// <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
-  2626817903U,	// <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
-  2258728203U,	// <3,7,5,4>: Cost 3 vrev <7,3,4,5>
-  2732971446U,	// <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
-  2732971457U,	// <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
-  2826964278U,	// <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2826964279U,	// <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2732971478U,	// <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
-  2732971486U,	// <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
-  2633454074U,	// <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
-  2633454152U,	// <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
-  2732971518U,	// <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
-  2732971526U,	// <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
-  2732971537U,	// <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
-  2732971540U,	// <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
-  2726041124U,	// <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
-  2570616934U,	// <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
-  2570617856U,	// <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
-  2564646635U,	// <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
-  2570619332U,	// <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
-  2570620214U,	// <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
-  2582564726U,	// <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
-  2588537423U,	// <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
-  1659229804U,	// <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1659229804U,	// <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
-  2626819795U,	// <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
-  1553078062U,	// <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626819973U,	// <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
-  2826961565U,	// <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
-  2626820159U,	// <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
-  1553078426U,	// <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  1595545808U,	// <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
-  1659229804U,	// <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1553078629U,	// <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  1611448320U,	// <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611896531U,	// <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
-  1659672284U,	// <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
-  1616099045U,	// <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  2685638381U,	// <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
-  1663874806U,	// <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
-  1663874816U,	// <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
-  2960313672U,	// <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  1611896594U,	// <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
-  1549763324U,	// <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
-  1550426957U,	// <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
-  537712430U,	// <3,u,1,2>: Cost 1 vext3 LHS, LHS
-  1616541495U,	// <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
-  1490930998U,	// <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
-  1553081489U,	// <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
-  2627486946U,	// <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
-  1659230043U,	// <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
-  537712484U,	// <3,u,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,	// <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2624833102U,	// <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
-  1557063287U,	// <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
-  1616099205U,	// <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
-  1611890892U,	// <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2689841054U,	// <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
-  1559717819U,	// <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
-  1659230124U,	// <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
-  1616541618U,	// <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
-  1611896764U,	// <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
-  1484973079U,	// <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
-  2685638607U,	// <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
-  336380006U,	// <3,u,3,3>: Cost 1 vdup3 LHS
-  1611896804U,	// <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
-  1616541679U,	// <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  2690283512U,	// <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
-  2959674696U,	// <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-  336380006U,	// <3,u,3,u>: Cost 1 vdup3 LHS
-  2558722150U,	// <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
-  1659672602U,	// <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
-  1659672612U,	// <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  2689841196U,	// <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
-  1659227344U,	// <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1611896895U,	// <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
-  1663875144U,	// <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
-  1659230289U,	// <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
-  1611896922U,	// <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
-  1490960486U,	// <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
-  2689841261U,	// <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
-  1490962162U,	// <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
-  1616541823U,	// <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1490963766U,	// <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
-  1659228164U,	// <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-  537712794U,	// <3,u,5,6>: Cost 1 vext3 LHS, RHS
-  1659230371U,	// <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
-  537712812U,	// <3,u,5,u>: Cost 1 vext3 LHS, RHS
-  2689841327U,	// <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
-  2558739482U,	// <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
-  2689841351U,	// <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
-  1616099536U,	// <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1659227508U,	// <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2690283746U,	// <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
-  1659228984U,	// <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659230445U,	// <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
-  1616099581U,	// <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
-  1485004902U,	// <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
-  1485005851U,	// <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
-  2558748264U,	// <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
-  3095397021U,	// <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1485008182U,	// <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
-  1659228328U,	// <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2722060599U,	// <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
-  1659229804U,	// <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1485010734U,	// <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
-  1616099665U,	// <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
-  1611897179U,	// <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
-  537712997U,	// <3,u,u,2>: Cost 1 vext3 LHS, LHS
-  336380006U,	// <3,u,u,3>: Cost 1 vdup3 LHS
-  1616099705U,	// <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
-  1611897219U,	// <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
-  537713037U,	// <3,u,u,6>: Cost 1 vext3 LHS, RHS
-  1659230607U,	// <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
-  537713051U,	// <3,u,u,u>: Cost 1 vext3 LHS, LHS
-  2691907584U,	// <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
-  2691907594U,	// <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
-  2691907604U,	// <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
-  3709862144U,	// <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
-  2684682280U,	// <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
-  3694600633U,	// <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
-  3291431290U,	// <4,0,0,6>: Cost 4 vrev <0,4,6,0>
-  3668342067U,	// <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
-  2691907657U,	// <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
-  2570715238U,	// <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
-  2570716058U,	// <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
-  1618165862U,	// <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570717648U,	// <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
-  2570718518U,	// <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
-  2594607206U,	// <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
-  3662377563U,	// <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
-  2594608436U,	// <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
-  1618165916U,	// <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2685714598U,	// <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
-  3759530159U,	// <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
-  2685862072U,	// <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
-  2631476937U,	// <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
-  2685714636U,	// <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
-  3765649622U,	// <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
-  2686157020U,	// <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  3668358453U,	// <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
-  2686304494U,	// <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
-  3632529510U,	// <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
-  2686451968U,	// <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2686525705U,	// <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
-  3760341266U,	// <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
-  3632532790U,	// <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
-  3913254606U,	// <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
-  3705219740U,	// <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
-  3713845990U,	// <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
-  2686451968U,	// <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2552823910U,	// <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
-  2691907922U,	// <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
-  2691907932U,	// <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
-  3626567830U,	// <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
-  2552827190U,	// <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
-  2631478582U,	// <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  3626570017U,	// <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
-  3668374839U,	// <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
-  2552829742U,	// <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
-  2558804070U,	// <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
-  1839644774U,	// <4,0,5,1>: Cost 2 vzipl RHS, LHS
-  2913386660U,	// <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
-  2570750420U,	// <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
-  2558807350U,	// <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
-  3987128750U,	// <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
-  3987128822U,	// <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
-  2594641208U,	// <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
-  1839645341U,	// <4,0,5,u>: Cost 2 vzipl RHS, LHS
-  2552840294U,	// <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
-  3047604234U,	// <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
-  1973862502U,	// <4,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2570758613U,	// <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
-  2552843574U,	// <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
-  2217664887U,	// <4,0,6,5>: Cost 3 vrev <0,4,5,6>
-  3662418528U,	// <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
-  2658022257U,	// <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
-  1973862556U,	// <4,0,6,u>: Cost 2 vtrnl RHS, LHS
-  3731764218U,	// <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
-  3988324454U,	// <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
-  4122034278U,	// <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
-  3735082246U,	// <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
-  3731764536U,	// <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
-  3937145718U,	// <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
-  3737073145U,	// <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
-  3731764844U,	// <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
-  4122034332U,	// <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
-  2552856678U,	// <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
-  1841635430U,	// <4,0,u,1>: Cost 2 vzipl RHS, LHS
-  1618166429U,	// <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570774999U,	// <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
-  2552859958U,	// <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
-  2631481498U,	// <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  2686157020U,	// <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  2594665787U,	// <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
-  1618166483U,	// <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2617548837U,	// <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
-  2622857318U,	// <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
-  3693281484U,	// <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
-  2691908342U,	// <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
-  2622857554U,	// <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
-  3764470538U,	// <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
-  3695272459U,	// <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
-  3733094980U,	// <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
-  2622857885U,	// <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
-  3696599798U,	// <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
-  2691097399U,	// <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
-  2631484314U,	// <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
-  2691908424U,	// <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
-  3696600125U,	// <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
-  3696600175U,	// <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
-  3696600307U,	// <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
-  3668423997U,	// <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
-  2691908469U,	// <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
-  2570797158U,	// <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
-  2570797978U,	// <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
-  3696600680U,	// <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
-  1618166682U,	// <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
-  2570800438U,	// <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
-  3765650347U,	// <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
-  3696601018U,	// <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
-  3668432190U,	// <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
-  1618535367U,	// <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
-  2564833382U,	// <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
-  2691908568U,	// <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
-  2691908578U,	// <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
-  2692572139U,	// <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
-  2564836662U,	// <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
-  2691908608U,	// <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
-  2588725862U,	// <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  3662468090U,	// <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
-  2691908631U,	// <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
-  3760194590U,	// <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
-  3693947874U,	// <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
-  3765650484U,	// <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
-  3113877606U,	// <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
-  3760194630U,	// <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
-  2622860598U,	// <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  3297436759U,	// <4,1,4,6>: Cost 4 vrev <1,4,6,4>
-  3800007772U,	// <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
-  2622860841U,	// <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
-  1479164006U,	// <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552906486U,	// <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
-  2552907299U,	// <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
-  2552907926U,	// <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
-  1479167286U,	// <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  2913387664U,	// <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
-  2600686074U,	// <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
-  2600686586U,	// <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479169838U,	// <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552914022U,	// <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
-  2558886708U,	// <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
-  4028205206U,	// <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
-  3089858662U,	// <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
-  2552917302U,	// <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
-  2223637584U,	// <4,1,6,5>: Cost 3 vrev <1,4,5,6>
-  4121347081U,	// <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
-  3721155406U,	// <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
-  2552919854U,	// <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
-  2659357716U,	// <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  3733763173U,	// <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
-  3734426806U,	// <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
-  2695226671U,	// <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
-  3721155942U,	// <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
-  3721155976U,	// <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
-  3662500458U,	// <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
-  3721156204U,	// <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
-  2659357716U,	// <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  1479188582U,	// <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
-  2552931062U,	// <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
-  2552931944U,	// <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
-  1622148480U,	// <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
-  1479191862U,	// <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
-  2622863514U,	// <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  2588725862U,	// <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2600686586U,	// <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479194414U,	// <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
-  2617557030U,	// <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
-  2622865510U,	// <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
-  2622865612U,	// <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
-  3693289753U,	// <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
-  2635473244U,	// <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
-  3765650918U,	// <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
-  2696775148U,	// <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
-  3695944285U,	// <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
-  2622866077U,	// <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
-  3696607990U,	// <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
-  3696608052U,	// <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
-  3696608150U,	// <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
-  3895574630U,	// <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
-  2691909162U,	// <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608400U,	// <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
-  3760784956U,	// <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
-  3773908549U,	// <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
-  2691909162U,	// <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608748U,	// <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
-  3696608828U,	// <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
-  2691909224U,	// <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
-  2691909234U,	// <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
-  3759605368U,	// <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
-  3696609156U,	// <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
-  3760785040U,	// <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
-  3668505927U,	// <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
-  2691909279U,	// <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
-  2691909286U,	// <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
-  3764840111U,	// <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
-  3765651129U,	// <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
-  2698544836U,	// <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
-  2685863630U,	// <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
-  2698692310U,	// <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
-  3772507871U,	// <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
-  2698839784U,	// <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
-  2691909358U,	// <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
-  2564915302U,	// <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
-  2564916122U,	// <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
-  2564917004U,	// <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
-  2699208469U,	// <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
-  2564918582U,	// <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
-  2622868790U,	// <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229667632U,	// <4,2,4,6>: Cost 3 vrev <2,4,6,4>
-  3800082229U,	// <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
-  2622869033U,	// <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
-  2552979558U,	// <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
-  2558952342U,	// <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
-  2564925032U,	// <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
-  2967060582U,	// <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
-  2552982838U,	// <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
-  3987130190U,	// <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
-  2913388474U,	// <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
-  3895577910U,	// <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
-  2552985390U,	// <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
-  1479245926U,	// <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
-  2552988406U,	// <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
-  2552989288U,	// <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
-  2954461286U,	// <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
-  1479249206U,	// <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
-  2229610281U,	// <4,2,6,5>: Cost 3 vrev <2,4,5,6>
-  2600767994U,	// <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
-  2600768506U,	// <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
-  1479251758U,	// <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
-  2659365909U,	// <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  3733771366U,	// <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
-  3734434999U,	// <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
-  2701199368U,	// <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
-  4175774618U,	// <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
-  3303360298U,	// <4,2,7,5>: Cost 4 vrev <2,4,5,7>
-  3727136217U,	// <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
-  3727136364U,	// <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
-  2659365909U,	// <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  1479262310U,	// <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
-  2553004790U,	// <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
-  2553005672U,	// <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
-  2954477670U,	// <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
-  1479265590U,	// <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
-  2622871706U,	// <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229700404U,	// <4,2,u,6>: Cost 3 vrev <2,4,6,u>
-  2600784890U,	// <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
-  1479268142U,	// <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
-  3765651595U,	// <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
-  2691909782U,	// <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
-  2702452897U,	// <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
-  3693297946U,	// <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
-  3760711856U,	// <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
-  2235533820U,	// <4,3,0,5>: Cost 3 vrev <3,4,5,0>
-  3309349381U,	// <4,3,0,6>: Cost 4 vrev <3,4,6,0>
-  3668563278U,	// <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
-  2691909845U,	// <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
-  2235173328U,	// <4,3,1,0>: Cost 3 vrev <3,4,0,1>
-  3764840678U,	// <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
-  2630173594U,	// <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
-  2703190267U,	// <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
-  3760195840U,	// <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
-  3765651724U,	// <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
-  3309357574U,	// <4,3,1,6>: Cost 4 vrev <3,4,6,1>
-  3769633054U,	// <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
-  2703558952U,	// <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
-  3626770534U,	// <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
-  2630174250U,	// <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
-  3765651777U,	// <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
-  2703853900U,	// <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
-  3626773814U,	// <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
-  2704001374U,	// <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
-  3765651814U,	// <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
-  3769633135U,	// <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
-  2634819681U,	// <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
-  3765651839U,	// <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
-  3765651848U,	// <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
-  3710552404U,	// <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
-  2691910044U,	// <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2704591270U,	// <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
-  3769633202U,	// <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
-  3703917212U,	// <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
-  3769633220U,	// <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
-  2691910044U,	// <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2691910096U,	// <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
-  2691910106U,	// <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
-  2564990741U,	// <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
-  3765651946U,	// <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
-  2691910136U,	// <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
-  2686454274U,	// <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
-  2235640329U,	// <4,3,4,6>: Cost 3 vrev <3,4,6,4>
-  3801483792U,	// <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
-  2691910168U,	// <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
-  2559025254U,	// <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559026237U,	// <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
-  2564998862U,	// <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
-  2570971548U,	// <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
-  2559028534U,	// <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
-  4163519477U,	// <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
-  3309390346U,	// <4,3,5,6>: Cost 4 vrev <3,4,6,5>
-  2706139747U,	// <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
-  2559031086U,	// <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559033446U,	// <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
-  2559034430U,	// <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
-  2565007127U,	// <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
-  2570979740U,	// <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
-  2559036726U,	// <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
-  1161841154U,	// <4,3,6,5>: Cost 2 vrev <3,4,5,6>
-  4028203932U,	// <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
-  2706803380U,	// <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
-  1162062365U,	// <4,3,6,u>: Cost 2 vrev <3,4,u,6>
-  3769633475U,	// <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
-  3769633488U,	// <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
-  3638757144U,	// <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
-  3769633508U,	// <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
-  3769633515U,	// <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
-  3769633526U,	// <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
-  3662647932U,	// <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
-  3781208837U,	// <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
-  3769633547U,	// <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
-  2559049830U,	// <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
-  2691910430U,	// <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
-  2565023513U,	// <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
-  2707835698U,	// <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
-  2559053110U,	// <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
-  1161857540U,	// <4,3,u,5>: Cost 2 vrev <3,4,5,u>
-  2235673101U,	// <4,3,u,6>: Cost 3 vrev <3,4,6,u>
-  2708130646U,	// <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
-  1162078751U,	// <4,3,u,u>: Cost 2 vrev <3,4,u,u>
-  2617573416U,	// <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
-  1570373734U,	// <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779676774U,	// <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  3760196480U,	// <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
-  2576977100U,	// <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
-  2718747538U,	// <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
-  2718747548U,	// <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
-  3668637015U,	// <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
-  1570374301U,	// <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
-  2644116214U,	// <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
-  2644116276U,	// <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
-  2691910602U,	// <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
-  2644116440U,	// <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
-  2711227356U,	// <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
-  2709310438U,	// <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
-  3765652462U,	// <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
-  3768970231U,	// <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
-  2695891968U,	// <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
-  3703260634U,	// <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
-  3765652499U,	// <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
-  2644117096U,	// <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
-  2631509709U,	// <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
-  2644117269U,	// <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
-  3705251698U,	// <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
-  2710047808U,	// <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
-  3783863369U,	// <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
-  2634827874U,	// <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
-  2644117654U,	// <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
-  3638797210U,	// <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
-  3638798082U,	// <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
-  2637482406U,	// <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  2638146039U,	// <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
-  3913287374U,	// <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
-  3765652625U,	// <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
-  3713878762U,	// <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
-  2637482406U,	// <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  1503264870U,	// <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
-  2577007514U,	// <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
-  2577008232U,	// <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
-  2571037175U,	// <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
-  161926454U,	// <4,4,4,4>: Cost 1 vdup0 RHS
-  1570377014U,	// <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
-  2779680054U,	// <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
-  2594927963U,	// <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-  161926454U,	// <4,4,4,u>: Cost 1 vdup0 RHS
-  2571042918U,	// <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
-  2571043738U,	// <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
-  3638814495U,	// <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
-  2571045368U,	// <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
-  2571046198U,	// <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
-  1839648054U,	// <4,4,5,5>: Cost 2 vzipl RHS, RHS
-  1618169142U,	// <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594936156U,	// <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
-  1618169160U,	// <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  2553135206U,	// <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
-  3626877686U,	// <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
-  2565080782U,	// <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
-  2571053561U,	// <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
-  2553138486U,	// <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
-  2241555675U,	// <4,4,6,5>: Cost 3 vrev <4,4,5,6>
-  1973865782U,	// <4,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2658055029U,	// <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
-  1973865800U,	// <4,4,6,u>: Cost 2 vtrnl RHS, RHS
-  2644120570U,	// <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
-  3638829978U,	// <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
-  3638830881U,	// <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
-  3735115018U,	// <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
-  2662036827U,	// <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  2713292236U,	// <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
-  2713365973U,	// <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
-  2644121196U,	// <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
-  2662036827U,	// <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  1503297638U,	// <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
-  1570379566U,	// <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779682606U,	// <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  2571069947U,	// <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
-  161926454U,	// <4,4,u,4>: Cost 1 vdup0 RHS
-  1841638710U,	// <4,4,u,5>: Cost 2 vzipl RHS, RHS
-  1618169385U,	// <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594960735U,	// <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
-  161926454U,	// <4,4,u,u>: Cost 1 vdup0 RHS
-  2631516160U,	// <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
-  1557774438U,	// <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2618908875U,	// <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
-  2571078140U,	// <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
-  2626871634U,	// <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
-  3705258414U,	// <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
-  2594968438U,	// <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
-  2594968928U,	// <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
-  1557775005U,	// <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631516918U,	// <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
-  2624217939U,	// <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
-  2631517078U,	// <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
-  2821341286U,	// <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
-  3895086054U,	// <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
-  2626872471U,	// <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
-  3895083131U,	// <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
-  2718748368U,	// <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
-  2821341291U,	// <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
-  2571092070U,	// <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
-  3699287585U,	// <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
-  2630854269U,	// <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
-  1557776078U,	// <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
-  2631517974U,	// <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
-  3692652384U,	// <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
-  2631518138U,	// <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
-  4164013366U,	// <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
-  1561094243U,	// <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
-  2631518358U,	// <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
-  3895084710U,	// <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
-  2631518540U,	// <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
-  2631518620U,	// <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
-  2631518716U,	// <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
-  2631518784U,	// <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
-  2658060980U,	// <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
-  2640145131U,	// <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
-  2631519006U,	// <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
-  2571108454U,	// <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
-  3632907342U,	// <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
-  2571110094U,	// <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
-  2571110912U,	// <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
-  2571111734U,	// <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
-  1557777718U,	// <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2645454195U,	// <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
-  2718748614U,	// <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
-  1557777961U,	// <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
-  1503346790U,	// <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
-  2913398480U,	// <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
-  2631519998U,	// <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
-  2577090710U,	// <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
-  1503349978U,	// <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
-  2631520260U,	// <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
-  2913390690U,	// <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
-  2821344566U,	// <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
-  1503352622U,	// <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
-  1497383014U,	// <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
-  2559181904U,	// <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
-  2565154601U,	// <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
-  1497385474U,	// <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
-  1497386294U,	// <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
-  3047608324U,	// <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
-  2571129656U,	// <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
-  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
-  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
-  2565161062U,	// <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
-  2565161882U,	// <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
-  2565162794U,	// <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
-  2661381387U,	// <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
-  2565164342U,	// <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
-  2718748840U,	// <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
-  2718748846U,	// <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
-  2719412407U,	// <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
-  2565166894U,	// <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
-  1497399398U,	// <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
-  1557780270U,	// <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631522181U,	// <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
-  1497401860U,	// <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
-  1497402678U,	// <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
-  1557780634U,	// <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2631522512U,	// <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
-  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
-  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
-  2618916864U,	// <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
-  1545175142U,	// <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1545175244U,	// <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
-  3692658940U,	// <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
-  2618917202U,	// <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
-  3852910806U,	// <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
-  2253525648U,	// <4,6,0,6>: Cost 3 vrev <6,4,6,0>
-  4040764726U,	// <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
-  1545175709U,	// <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  2618917622U,	// <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
-  2618917684U,	// <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
-  2618917782U,	// <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
-  2618917848U,	// <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
-  3692659773U,	// <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
-  2618918032U,	// <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
-  3692659937U,	// <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
-  4032146742U,	// <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
-  2618918253U,	// <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
-  2618918380U,	// <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
-  2618918460U,	// <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
-  2618918504U,	// <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
-  2618918566U,	// <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
-  2618918679U,	// <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
-  2618918788U,	// <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
-  2618918842U,	// <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
-  2718749178U,	// <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
-  2618918971U,	// <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
-  2618919062U,	// <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
-  2636171526U,	// <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
-  3692661057U,	// <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
-  2618919324U,	// <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
-  2618919426U,	// <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
-  2638826058U,	// <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
-  3913303030U,	// <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
-  2722730572U,	// <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
-  2618919710U,	// <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
-  2565210214U,	// <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
-  2718749286U,	// <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
-  2565211952U,	// <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
-  2571184649U,	// <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
-  2565213494U,	// <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
-  1545178422U,	// <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705430326U,	// <4,6,4,6>: Cost 2 vuzpl RHS, RHS
-  2595075437U,	// <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
-  1545178665U,	// <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
-  2565218406U,	// <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
-  2645462736U,	// <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
-  2913399290U,	// <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
-  3913305394U,	// <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
-  2645462982U,	// <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
-  2779172868U,	// <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
-  2913391416U,	// <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
-  2821426486U,	// <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
-  2821426487U,	// <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
-  1503428710U,	// <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
-  2577171190U,	// <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
-  2645463546U,	// <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
-  2577172630U,	// <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
-  1503431908U,	// <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
-  2253501069U,	// <4,6,6,5>: Cost 3 vrev <6,4,5,6>
-  2618921784U,	// <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
-  2954464566U,	// <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
-  1503434542U,	// <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
-  2645464058U,	// <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
-  2779173882U,	// <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
-  3638978355U,	// <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
-  2725090156U,	// <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
-  2645464422U,	// <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
-  2779174246U,	// <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  3852915914U,	// <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
-  2779174508U,	// <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2779173945U,	// <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
-  1503445094U,	// <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
-  1545180974U,	// <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1705432878U,	// <4,6,u,2>: Cost 2 vuzpl RHS, LHS
-  2618922940U,	// <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
-  1503448294U,	// <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
-  1545181338U,	// <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705433242U,	// <4,6,u,6>: Cost 2 vuzpl RHS, RHS
-  2954480950U,	// <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
-  1545181541U,	// <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  3706601472U,	// <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
-  2632859750U,	// <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2726343685U,	// <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
-  3701293312U,	// <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
-  3706601810U,	// <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
-  2259424608U,	// <4,7,0,5>: Cost 3 vrev <7,4,5,0>
-  3695321617U,	// <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
-  3800454194U,	// <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
-  2632860317U,	// <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
-  2259064116U,	// <4,7,1,0>: Cost 3 vrev <7,4,0,1>
-  3700630324U,	// <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
-  2632860570U,	// <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
-  3769635936U,	// <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
-  3656920374U,	// <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
-  3700630681U,	// <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
-  3701294314U,	// <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
-  3793818754U,	// <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
-  2259654012U,	// <4,7,1,u>: Cost 3 vrev <7,4,u,1>
-  3656925286U,	// <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
-  3706603050U,	// <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
-  3706603112U,	// <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
-  2727744688U,	// <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
-  3705939745U,	// <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
-  2632861554U,	// <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
-  3706603450U,	// <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
-  3792491731U,	// <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
-  2634852453U,	// <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
-  3706603670U,	// <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
-  3662906266U,	// <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
-  3725183326U,	// <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
-  3706603932U,	// <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
-  3701295618U,	// <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
-  2638834251U,	// <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
-  2639497884U,	// <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
-  3802445093U,	// <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
-  2640825150U,	// <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
-  2718750004U,	// <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
-  3706604490U,	// <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
-  3656943474U,	// <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
-  3779884371U,	// <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
-  2259383643U,	// <4,7,4,4>: Cost 3 vrev <7,4,4,4>
-  2632863030U,	// <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
-  2259531117U,	// <4,7,4,6>: Cost 3 vrev <7,4,6,4>
-  3907340074U,	// <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
-  2632863273U,	// <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
-  2913391610U,	// <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
-  3645006848U,	// <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
-  2589181646U,	// <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
-  3645008403U,	// <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
-  2913391974U,	// <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
-  2583211973U,	// <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
-  2589184670U,	// <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
-  2913392236U,	// <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
-  2913392258U,	// <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
-  1509474406U,	// <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
-  3047609338U,	// <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
-  2583217768U,	// <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
-  2583218326U,	// <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
-  1509477686U,	// <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
-  1509478342U,	// <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
-  2583220730U,	// <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
-  3047609964U,	// <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509480238U,	// <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
-  3650994278U,	// <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
-  3650995098U,	// <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
-  3650996010U,	// <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
-  3804804677U,	// <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
-  3650997486U,	// <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
-  2662725039U,	// <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
-  3662942880U,	// <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
-  2718750316U,	// <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
-  2664715938U,	// <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
-  1509490790U,	// <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
-  2632865582U,	// <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2583234152U,	// <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
-  2583234710U,	// <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
-  1509494070U,	// <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
-  1509494728U,	// <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
-  2583237114U,	// <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
-  3047757420U,	// <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509496622U,	// <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
-  2618933248U,	// <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
-  1545191526U,	// <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1545191630U,	// <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
-  2691913445U,	// <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
-  2618933586U,	// <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
-  2265397305U,	// <4,u,0,5>: Cost 3 vrev <u,4,5,0>
-  2595189625U,	// <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
-  2595190139U,	// <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
-  1545192093U,	// <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
-  2618934006U,	// <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
-  2618934068U,	// <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
-  1618171694U,	// <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2618934232U,	// <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
-  2695894848U,	// <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
-  2618934416U,	// <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
-  3692676321U,	// <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
-  2718750555U,	// <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
-  1618171748U,	// <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2553397350U,	// <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
-  2630215215U,	// <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
-  2618934888U,	// <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
-  1557800657U,	// <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
-  2618935065U,	// <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
-  2733864859U,	// <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
-  2618935226U,	// <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
-  2718750636U,	// <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
-  1561118822U,	// <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
-  2618935446U,	// <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
-  2779318422U,	// <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
-  2636851545U,	// <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
-  2618935708U,	// <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
-  2618935810U,	// <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
-  2691913711U,	// <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
-  2588725862U,	// <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2640169710U,	// <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
-  2618936094U,	// <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
-  1503559782U,	// <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
-  2692282391U,	// <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
-  2565359426U,	// <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
-  2571332123U,	// <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
-  161926454U,	// <4,u,4,4>: Cost 1 vdup0 RHS
-  1545194806U,	// <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1705577782U,	// <4,u,4,6>: Cost 2 vuzpl RHS, RHS
-  2718750801U,	// <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
-  161926454U,	// <4,u,4,u>: Cost 1 vdup0 RHS
-  1479164006U,	// <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  1839650606U,	// <4,u,5,1>: Cost 2 vzipl RHS, LHS
-  2565367502U,	// <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
-  3089777309U,	// <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
-  1479167286U,	// <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  1839650970U,	// <4,u,5,5>: Cost 2 vzipl RHS, RHS
-  1618172058U,	// <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  3089780265U,	// <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
-  1618172076U,	// <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  1479688294U,	// <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
-  2553430774U,	// <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
-  1973868334U,	// <4,u,6,2>: Cost 2 vtrnl RHS, LHS
-  1497606685U,	// <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
-  1479691574U,	// <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
-  1509552079U,	// <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
-  1973868698U,	// <4,u,6,6>: Cost 2 vtrnl RHS, RHS
-  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
-  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
-  2565382246U,	// <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
-  2565383066U,	// <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
-  2565384005U,	// <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
-  2661405966U,	// <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
-  2565385526U,	// <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
-  2779321702U,	// <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  2589274793U,	// <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
-  2779321964U,	// <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2565388078U,	// <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
-  1479704678U,	// <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
-  1545197358U,	// <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1618172261U,	// <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  1497623071U,	// <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
-  161926454U,	// <4,u,u,4>: Cost 1 vdup0 RHS
-  1545197722U,	// <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1618172301U,	// <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
-  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
-  2687123456U,	// <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
-  2687123466U,	// <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
-  2687123476U,	// <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
-  3710599434U,	// <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
-  2642166098U,	// <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
-  3657060306U,	// <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
-  3292094923U,	// <5,0,0,6>: Cost 4 vrev <0,5,6,0>
-  3669005700U,	// <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
-  2687123530U,	// <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
-  2559434854U,	// <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
-  2559435887U,	// <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
-  1613381734U,	// <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  3698656256U,	// <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
-  2559438134U,	// <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
-  2583326675U,	// <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
-  3715908851U,	// <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
-  3657069562U,	// <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
-  1613381788U,	// <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2686017700U,	// <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
-  2685796528U,	// <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2698625208U,	// <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
-  2685944002U,	// <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
-  2686017739U,	// <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
-  2686091476U,	// <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
-  2725167324U,	// <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
-  2595280230U,	// <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  2686312687U,	// <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
-  3760128248U,	// <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
-  3759685888U,	// <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
-  2686533898U,	// <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
-  3760349459U,	// <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
-  2638187004U,	// <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
-  3776348452U,	// <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
-  3713256094U,	// <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
-  3914064896U,	// <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
-  2686976320U,	// <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
-  2559459430U,	// <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
-  1613381970U,	// <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  2687123804U,	// <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
-  3761013092U,	// <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
-  2559462710U,	// <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
-  2638187830U,	// <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  3761234303U,	// <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
-  2646150600U,	// <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1613381970U,	// <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  3766763926U,	// <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
-  2919268454U,	// <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
-  3053486182U,	// <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
-  3723210589U,	// <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
-  3766763966U,	// <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
-  2650796031U,	// <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
-  3719893090U,	// <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
-  3914067254U,	// <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
-  2919269021U,	// <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
-  4047519744U,	// <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
-  2920038502U,	// <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  3759759871U,	// <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
-  3645164070U,	// <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
-  3762414095U,	// <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
-  3993780690U,	// <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
-  3719893816U,	// <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
-  2662077302U,	// <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
-  2920039069U,	// <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
-  2565455974U,	// <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
-  2565456790U,	// <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
-  2565457742U,	// <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
-  3639199894U,	// <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
-  2565459254U,	// <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
-  2589347938U,	// <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
-  2589348530U,	// <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
-  4188456422U,	// <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
-  2565461806U,	// <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
-  2687124106U,	// <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
-  1616036502U,	// <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
-  1613382301U,	// <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  2689925800U,	// <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
-  2687124146U,	// <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
-  2638190746U,	// <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  2589356723U,	// <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
-  2595280230U,	// <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  1613382355U,	// <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2646818816U,	// <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
-  1573077094U,	// <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2646818980U,	// <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
-  2687124214U,	// <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
-  2641510738U,	// <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
-  2641510814U,	// <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
-  3720561142U,	// <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
-  3298141357U,	// <5,1,0,7>: Cost 4 vrev <1,5,7,0>
-  1573077661U,	// <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
-  2223891567U,	// <5,1,1,0>: Cost 3 vrev <1,5,0,1>
-  2687124276U,	// <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
-  2646819734U,	// <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
-  2687124296U,	// <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
-  2691326803U,	// <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
-  2691400540U,	// <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
-  3765216101U,	// <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
-  3765289838U,	// <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
-  2687124341U,	// <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
-  3297641584U,	// <5,1,2,0>: Cost 4 vrev <1,5,0,2>
-  3763520391U,	// <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
-  2646820456U,	// <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
-  2687124374U,	// <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
-  2691990436U,	// <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
-  2687124395U,	// <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
-  2646820794U,	// <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
-  3808199610U,	// <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
-  2687124419U,	// <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
-  2577440870U,	// <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
-  2687124440U,	// <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
-  3759686627U,	// <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
-  2692580332U,	// <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
-  2687124469U,	// <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
-  2685207552U,	// <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
-  3760866313U,	// <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
-  2692875280U,	// <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
-  2687124503U,	// <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
-  1567771538U,	// <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
-  2693096491U,	// <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
-  2693170228U,	// <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
-  2687124541U,	// <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
-  2646822096U,	// <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
-  1573080374U,	// <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646822260U,	// <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
-  3298174129U,	// <5,1,4,7>: Cost 4 vrev <1,5,7,4>
-  1573080602U,	// <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
-  2687124591U,	// <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
-  2646822543U,	// <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
-  3760866433U,	// <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
-  2687124624U,	// <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
-  2687124631U,	// <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
-  2646822916U,	// <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
-  2646823010U,	// <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
-  2646823080U,	// <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
-  2687124663U,	// <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
-  2553577574U,	// <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
-  3763520719U,	// <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
-  2646823418U,	// <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
-  3760866529U,	// <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
-  2553580854U,	// <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
-  2687124723U,	// <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
-  2646823736U,	// <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
-  2646823758U,	// <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
-  2646823839U,	// <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
-  2559557734U,	// <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
-  2559558452U,	// <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
-  2571503270U,	// <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
-  2040971366U,	// <5,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2559561014U,	// <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
-  2595393232U,	// <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
-  4188455035U,	// <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
-  2646824556U,	// <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
-  2040971371U,	// <5,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1591662326U,	// <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
-  1573082926U,	// <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2695824760U,	// <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
-  2040979558U,	// <5,1,u,3>: Cost 2 vtrnr RHS, LHS
-  2687124874U,	// <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
-  1573083290U,	// <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646825168U,	// <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
-  2646825216U,	// <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
-  2040979563U,	// <5,1,u,u>: Cost 2 vtrnr RHS, LHS
-  3702652928U,	// <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
-  2628911206U,	// <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2641518756U,	// <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
-  3759760847U,	// <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
-  3760866775U,	// <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
-  3759539680U,	// <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
-  3760866796U,	// <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
-  3304114054U,	// <5,2,0,7>: Cost 4 vrev <2,5,7,0>
-  2628911773U,	// <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
-  2623603464U,	// <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
-  3698008921U,	// <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
-  3633325603U,	// <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
-  2687125027U,	// <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
-  3633327414U,	// <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
-  3759539760U,	// <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
-  3760866876U,	// <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
-  3304122247U,	// <5,2,1,7>: Cost 4 vrev <2,5,7,1>
-  2687125072U,	// <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
-  3633332326U,	// <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
-  3759760992U,	// <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
-  2687125096U,	// <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
-  2687125106U,	// <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
-  2697963133U,	// <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
-  3759466120U,	// <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
-  3760866960U,	// <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
-  3771926168U,	// <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
-  2687125151U,	// <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
-  2687125158U,	// <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
-  2698405555U,	// <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
-  2577516238U,	// <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
-  3759687365U,	// <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
-  1624884942U,	// <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
-  2698700503U,	// <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
-  3772368608U,	// <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
-  3702655716U,	// <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
-  1625179890U,	// <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
-  2641521555U,	// <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
-  3772368642U,	// <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
-  2699142925U,	// <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
-  2698626838U,	// <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
-  2698626848U,	// <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
-  2628914486U,	// <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2645503353U,	// <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
-  3304146826U,	// <5,2,4,7>: Cost 4 vrev <2,5,7,4>
-  2628914729U,	// <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
-  2553643110U,	// <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
-  3758950227U,	// <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
-  3759761248U,	// <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
-  2982396006U,	// <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
-  2553646390U,	// <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
-  2553647108U,	// <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
-  3760867204U,	// <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
-  3702657141U,	// <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
-  2982396011U,	// <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
-  3627393126U,	// <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
-  3760867236U,	// <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
-  2645504506U,	// <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
-  2687125434U,	// <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
-  2700617665U,	// <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
-  3760867276U,	// <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
-  3763521493U,	// <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
-  3719246670U,	// <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
-  2687125479U,	// <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
-  2565603430U,	// <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
-  2553660150U,	// <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
-  2565605216U,	// <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
-  2961178726U,	// <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
-  2565606710U,	// <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
-  4034920552U,	// <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
-  3114713292U,	// <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
-  3702658668U,	// <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
-  2961178731U,	// <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
-  2687125563U,	// <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
-  2628917038U,	// <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2565613409U,	// <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
-  2687125592U,	// <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
-  1628203107U,	// <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
-  2628917402U,	// <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2702092405U,	// <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
-  3304179598U,	// <5,2,u,7>: Cost 4 vrev <2,5,7,u>
-  1628498055U,	// <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
-  3760867467U,	// <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
-  2687125654U,	// <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
-  3759761565U,	// <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
-  3633391766U,	// <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
-  2687125680U,	// <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
-  3760277690U,	// <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
-  3310013014U,	// <5,3,0,6>: Cost 4 vrev <3,5,6,0>
-  2236344927U,	// <5,3,0,7>: Cost 3 vrev <3,5,7,0>
-  2687125717U,	// <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
-  3760867551U,	// <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
-  3760867558U,	// <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
-  2624938923U,	// <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
-  2703198460U,	// <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
-  3760867587U,	// <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
-  2636219536U,	// <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
-  3698681075U,	// <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
-  2703493408U,	// <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
-  2628920721U,	// <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
-  3766765870U,	// <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
-  3698681379U,	// <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
-  3760867649U,	// <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
-  2698627404U,	// <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
-  2703935830U,	// <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
-  2698627422U,	// <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
-  3760867686U,	// <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
-  3769788783U,	// <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
-  2701945209U,	// <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
-  3760867711U,	// <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
-  2636220684U,	// <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
-  3772369298U,	// <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
-  2687125916U,	// <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
-  2704599463U,	// <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
-  2704673200U,	// <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
-  3709962935U,	// <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
-  3772369346U,	// <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
-  2704894411U,	// <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
-  2704968148U,	// <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
-  3698682850U,	// <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
-  2642857014U,	// <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
-  2705189359U,	// <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
-  2705263096U,	// <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
-  2685946370U,	// <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
-  3779152394U,	// <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
-  2236377699U,	// <5,3,4,7>: Cost 3 vrev <3,5,7,4>
-  2687126045U,	// <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
-  2571632742U,	// <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
-  2559689870U,	// <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
-  2571634382U,	// <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
-  2571635264U,	// <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
-  2571636022U,	// <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
-  2559692804U,	// <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
-  3720581218U,	// <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
-  2236385892U,	// <5,3,5,7>: Cost 3 vrev <3,5,7,5>
-  2571638574U,	// <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
-  2565668966U,	// <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
-  3633439887U,	// <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
-  2565670760U,	// <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
-  2565671426U,	// <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
-  2565672246U,	// <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
-  3639414630U,	// <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
-  4047521640U,	// <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
-  2725169844U,	// <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
-  2565674798U,	// <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
-  1485963366U,	// <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485964432U,	// <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
-  2559706728U,	// <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
-  2559707286U,	// <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
-  1485966646U,	// <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
-  2559708880U,	// <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
-  2601513466U,	// <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
-  3114714112U,	// <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
-  1485969198U,	// <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485971558U,	// <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
-  1485972625U,	// <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
-  2559714920U,	// <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
-  2559715478U,	// <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
-  1485974838U,	// <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
-  2687126342U,	// <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
-  2601521658U,	// <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
-  2236410471U,	// <5,3,u,7>: Cost 3 vrev <3,5,7,u>
-  1485977390U,	// <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
-  3627491430U,	// <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
-  2636890214U,	// <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
-  3703333028U,	// <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
-  3782249348U,	// <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
-  2642198866U,	// <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
-  2687126418U,	// <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
-  2242243887U,	// <5,4,0,6>: Cost 3 vrev <4,5,6,0>
-  3316059448U,	// <5,4,0,7>: Cost 4 vrev <4,5,7,0>
-  2636890781U,	// <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
-  2241809658U,	// <5,4,1,0>: Cost 3 vrev <4,5,0,1>
-  3698025307U,	// <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
-  3698688940U,	// <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
-  3698689024U,	// <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
-  3700016206U,	// <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
-  2687126498U,	// <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
-  3760868336U,	// <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
-  3316067641U,	// <5,4,1,7>: Cost 4 vrev <4,5,7,1>
-  2242399554U,	// <5,4,1,u>: Cost 3 vrev <4,5,u,1>
-  3703334371U,	// <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
-  3703998004U,	// <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
-  3704661637U,	// <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
-  2636891854U,	// <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
-  3705988903U,	// <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
-  2698628150U,	// <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
-  3760868415U,	// <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
-  3783871562U,	// <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
-  2666752099U,	// <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
-  3639459942U,	// <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
-  3709970701U,	// <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
-  2636892510U,	// <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
-  3710634396U,	// <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
-  2638219776U,	// <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
-  3766987908U,	// <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
-  2710719634U,	// <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
-  3914097664U,	// <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
-  2640874308U,	// <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
-  2583642214U,	// <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
-  2642201574U,	// <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
-  3710635062U,	// <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
-  3717270664U,	// <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
-  2713963728U,	// <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
-  1637567706U,	// <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
-  2242276659U,	// <5,4,4,6>: Cost 3 vrev <4,5,6,4>
-  2646183372U,	// <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
-  1637788917U,	// <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
-  2559762534U,	// <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
-  2559763607U,	// <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
-  2698628366U,	// <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
-  3633506454U,	// <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
-  2559765814U,	// <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
-  2583654395U,	// <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
-  1613385014U,	// <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  3901639990U,	// <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
-  1613385032U,	// <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
-  2559770726U,	// <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
-  2559771648U,	// <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
-  3633514088U,	// <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
-  2571717122U,	// <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
-  2559774006U,	// <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
-  2712636796U,	// <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
-  3760868743U,	// <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
-  2712784270U,	// <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
-  2559776558U,	// <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
-  2565750886U,	// <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
-  2565751706U,	// <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
-  2565752690U,	// <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
-  2571725387U,	// <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
-  2565754166U,	// <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
-  3114713426U,	// <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
-  94817590U,	// <5,4,7,6>: Cost 1 vrev RHS
-  2595616175U,	// <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
-  94965064U,	// <5,4,7,u>: Cost 1 vrev RHS
-  2559787110U,	// <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
-  2559788186U,	// <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
-  2242014483U,	// <5,4,u,2>: Cost 3 vrev <4,5,2,u>
-  2667419628U,	// <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
-  2559790390U,	// <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
-  1640222238U,	// <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
-  94825783U,	// <5,4,u,6>: Cost 1 vrev RHS
-  2714111536U,	// <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
-  94973257U,	// <5,4,u,u>: Cost 1 vrev RHS
-  2646851584U,	// <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
-  1573109862U,	// <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646851748U,	// <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
-  3760279130U,	// <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
-  2687127138U,	// <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
-  2248142847U,	// <5,5,0,5>: Cost 3 vrev <5,5,5,0>
-  3720593910U,	// <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
-  4182502710U,	// <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
-  1573110429U,	// <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646852342U,	// <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
-  2624291676U,	// <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
-  2646852502U,	// <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
-  2646852568U,	// <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
-  2715217591U,	// <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
-  2628936848U,	// <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
-  3698033907U,	// <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
-  2713964240U,	// <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
-  2628937107U,	// <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
-  3645497446U,	// <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
-  3760869099U,	// <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
-  2646853224U,	// <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
-  2698628862U,	// <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
-  3772370694U,	// <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
-  2713964303U,	// <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
-  2646853562U,	// <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
-  4038198272U,	// <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
-  2701946667U,	// <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
-  2646853782U,	// <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
-  3698034922U,	// <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
-  3702679919U,	// <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
-  2637564336U,	// <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
-  2646854146U,	// <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
-  2638891602U,	// <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
-  3702680247U,	// <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
-  3702680259U,	// <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
-  2646854430U,	// <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
-  2646854546U,	// <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
-  2642209767U,	// <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
-  3711306806U,	// <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
-  3645516369U,	// <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
-  1570458842U,	// <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1573113142U,	// <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
-  2645527932U,	// <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
-  2713964486U,	// <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
-  1573113374U,	// <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
-  1509982310U,	// <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2646855376U,	// <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
-  2583725672U,	// <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
-  2583726230U,	// <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
-  1509985590U,	// <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-  229035318U,	// <5,5,5,5>: Cost 1 vdup1 RHS
-  2646855778U,	// <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
-  2646855848U,	// <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
-  229035318U,	// <5,5,5,u>: Cost 1 vdup1 RHS
-  2577760358U,	// <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
-  3633587361U,	// <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
-  2646856186U,	// <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
-  3633588738U,	// <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
-  2718535756U,	// <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
-  2644202223U,	// <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
-  2973780482U,	// <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
-  2646856526U,	// <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
-  2646856607U,	// <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
-  2571796582U,	// <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
-  3633595392U,	// <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
-  2571798222U,	// <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
-  2571799124U,	// <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
-  2571799862U,	// <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
-  3114717188U,	// <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
-  4034923010U,	// <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
-  2040974646U,	// <5,5,7,7>: Cost 2 vtrnr RHS, RHS
-  2040974647U,	// <5,5,7,u>: Cost 2 vtrnr RHS, RHS
-  1509982310U,	// <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  1573115694U,	// <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2571806414U,	// <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
-  2571807317U,	// <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
-  1509985590U,	// <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
-  229035318U,	// <5,5,u,5>: Cost 1 vdup1 RHS
-  2646857936U,	// <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
-  2040982838U,	// <5,5,u,7>: Cost 2 vtrnr RHS, RHS
-  229035318U,	// <5,5,u,u>: Cost 1 vdup1 RHS
-  2638233600U,	// <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
-  1564491878U,	// <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  2632261796U,	// <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
-  2638233856U,	// <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
-  2638233938U,	// <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
-  3706003885U,	// <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
-  3706003967U,	// <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
-  4047473974U,	// <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
-  1564492445U,	// <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
-  2638234358U,	// <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
-  2638234420U,	// <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
-  2638234518U,	// <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
-  2638234584U,	// <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
-  2626290768U,	// <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
-  2638234768U,	// <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
-  3700032719U,	// <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
-  2982366518U,	// <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  2628945300U,	// <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
-  3706004925U,	// <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
-  3711976966U,	// <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
-  2638235240U,	// <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
-  2638235302U,	// <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
-  2632263465U,	// <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
-  2638235496U,	// <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
-  2638235578U,	// <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
-  2713965050U,	// <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
-  2634917997U,	// <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
-  2638235798U,	// <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
-  3711977695U,	// <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
-  3710650720U,	// <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
-  2638236060U,	// <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
-  1564494338U,	// <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
-  2638236234U,	// <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
-  3711978104U,	// <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
-  4034227510U,	// <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
-  1567148870U,	// <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
-  2577817702U,	// <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
-  3700034544U,	// <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
-  2723033713U,	// <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
-  2638236818U,	// <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
-  2644208859U,	// <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
-  1564495158U,	// <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  2645536125U,	// <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
-  2723402398U,	// <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
-  1564495401U,	// <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
-  2577825894U,	// <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
-  2662125264U,	// <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
-  3775836867U,	// <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
-  3711979343U,	// <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
-  2650181556U,	// <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
-  2662125572U,	// <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
-  2638237732U,	// <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
-  2982399286U,	// <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
-  2982399287U,	// <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
-  2583806054U,	// <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
-  3711979910U,	// <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
-  2662126074U,	// <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
-  2583808514U,	// <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
-  2583809334U,	// <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
-  2583810062U,	// <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
-  2638238520U,	// <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
-  2973781302U,	// <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2973781303U,	// <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
-  430358630U,	// <5,6,7,0>: Cost 1 vext1 RHS, LHS
-  1504101110U,	// <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1504101992U,	// <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504102550U,	// <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-  430361910U,	// <5,6,7,4>: Cost 1 vext1 RHS, RHS
-  1504104390U,	// <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
-  1504105272U,	// <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
-  1504106092U,	// <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
-  430364462U,	// <5,6,7,u>: Cost 1 vext1 RHS, LHS
-  430366822U,	// <5,6,u,0>: Cost 1 vext1 RHS, LHS
-  1564497710U,	// <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  1504110184U,	// <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504110742U,	// <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-  430370103U,	// <5,6,u,4>: Cost 1 vext1 RHS, RHS
-  1564498074U,	// <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  1504113146U,	// <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1504113658U,	// <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
-  430372654U,	// <5,6,u,u>: Cost 1 vext1 RHS, LHS
-  2625634304U,	// <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
-  1551892582U,	// <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625634468U,	// <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
-  2571889247U,	// <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
-  2625634642U,	// <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
-  2595778728U,	// <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
-  3699376639U,	// <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
-  2260235715U,	// <5,7,0,7>: Cost 3 vrev <7,5,7,0>
-  1551893149U,	// <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625635062U,	// <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
-  2624308020U,	// <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
-  2625635222U,	// <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
-  1551893504U,	// <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
-  2571898166U,	// <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
-  2625635472U,	// <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
-  2627626227U,	// <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
-  3702031684U,	// <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
-  1555211669U,	// <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
-  2629617126U,	// <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
-  3699377670U,	// <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
-  2625635944U,	// <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
-  2625636006U,	// <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
-  2632271658U,	// <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
-  2625636201U,	// <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
-  2625636282U,	// <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
-  3708004381U,	// <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
-  2625636411U,	// <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
-  2625636502U,	// <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
-  2625636604U,	// <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
-  3699378478U,	// <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
-  2625636764U,	// <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
-  2625636866U,	// <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
-  2625636959U,	// <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
-  3699378808U,	// <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
-  2640235254U,	// <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
-  2625637150U,	// <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
-  2571919462U,	// <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
-  2571920384U,	// <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
-  3699379260U,	// <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
-  2571922019U,	// <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
-  2571922742U,	// <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
-  1551895862U,	// <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2846277980U,	// <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646207951U,	// <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
-  1551896105U,	// <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
-  2583871590U,	// <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
-  2652180176U,	// <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
-  2625638177U,	// <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
-  2625638262U,	// <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
-  2583874870U,	// <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
-  2846281732U,	// <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
-  2651517015U,	// <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
-  1772539190U,	// <5,7,5,7>: Cost 2 vuzpr RHS, RHS
-  1772539191U,	// <5,7,5,u>: Cost 2 vuzpr RHS, RHS
-  2846281826U,	// <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
-  3699380615U,	// <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
-  2846281108U,	// <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
-  2589854210U,	// <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
-  2846281830U,	// <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
-  2725467658U,	// <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
-  2846281076U,	// <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2846279610U,	// <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
-  2846279611U,	// <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
-  1510146150U,	// <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
-  2846282574U,	// <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
-  2583889512U,	// <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
-  2846281919U,	// <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
-  1510149430U,	// <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
-  1510150168U,	// <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
-  2583892474U,	// <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
-  2625640044U,	// <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
-  1510151982U,	// <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
-  1510154342U,	// <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
-  1551898414U,	// <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625640325U,	// <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
-  1772536477U,	// <5,7,u,3>: Cost 2 vuzpr RHS, LHS
-  1510157622U,	// <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
-  1551898778U,	// <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2625640656U,	// <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
-  1772539433U,	// <5,7,u,7>: Cost 2 vuzpr RHS, RHS
-  1551898981U,	// <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625642496U,	// <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
-  1551900774U,	// <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625642660U,	// <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
-  2698630885U,	// <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
-  2687129325U,	// <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
-  2689783542U,	// <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
-  2266134675U,	// <5,u,0,6>: Cost 3 vrev <u,5,6,0>
-  2595853772U,	// <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
-  1551901341U,	// <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625643254U,	// <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
-  2625643316U,	// <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
-  1613387566U,	// <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1551901697U,	// <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
-  2626307154U,	// <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
-  2689783622U,	// <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
-  2627634420U,	// <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
-  2982366536U,	// <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  1613387620U,	// <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2846286742U,	// <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
-  2685796528U,	// <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2625644136U,	// <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
-  2687129480U,	// <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
-  2632279851U,	// <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
-  2625644394U,	// <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
-  2625644474U,	// <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
-  2713966508U,	// <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
-  2625644603U,	// <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
-  2687129532U,	// <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
-  2636261649U,	// <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
-  2636925282U,	// <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
-  2625644956U,	// <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
-  1564510724U,	// <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
-  2625645160U,	// <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
-  2734610422U,	// <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
-  2640243447U,	// <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
-  1567165256U,	// <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
-  1567828889U,	// <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
-  1661163546U,	// <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
-  2734463012U,	// <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
-  2698631212U,	// <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
-  1570458842U,	// <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1551904054U,	// <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
-  2846286172U,	// <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646216144U,	// <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
-  1551904297U,	// <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
-  1509982310U,	// <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2560058555U,	// <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
-  2698926194U,	// <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
-  2698631295U,	// <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
-  1509985590U,	// <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-  229035318U,	// <5,u,5,5>: Cost 1 vdup1 RHS
-  1613387930U,	// <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  1772547382U,	// <5,u,5,7>: Cost 2 vuzpr RHS, RHS
-  229035318U,	// <5,u,5,u>: Cost 1 vdup1 RHS
-  2566037606U,	// <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
-  2920044334U,	// <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  2566039445U,	// <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
-  2687129808U,	// <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
-  2566040886U,	// <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
-  2920044698U,	// <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
-  2846289268U,	// <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2973781320U,	// <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2687129853U,	// <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
-  430506086U,	// <5,u,7,0>: Cost 1 vext1 RHS, LHS
-  1486333117U,	// <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
-  1504249448U,	// <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  2040971933U,	// <5,u,7,3>: Cost 2 vtrnr RHS, LHS
-  430509384U,	// <5,u,7,4>: Cost 1 vext1 RHS, RHS
-  1504251600U,	// <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-  118708378U,	// <5,u,7,6>: Cost 1 vrev RHS
-  2040974889U,	// <5,u,7,7>: Cost 2 vtrnr RHS, RHS
-  430511918U,	// <5,u,7,u>: Cost 1 vext1 RHS, LHS
-  430514278U,	// <5,u,u,0>: Cost 1 vext1 RHS, LHS
-  1551906606U,	// <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  1613388133U,	// <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1772544669U,	// <5,u,u,3>: Cost 2 vuzpr RHS, LHS
-  430517577U,	// <5,u,u,4>: Cost 1 vext1 RHS, RHS
-  229035318U,	// <5,u,u,5>: Cost 1 vdup1 RHS
-  118716571U,	// <5,u,u,6>: Cost 1 vrev RHS
-  1772547625U,	// <5,u,u,7>: Cost 2 vuzpr RHS, RHS
-  430520110U,	// <5,u,u,u>: Cost 1 vext1 RHS, LHS
-  2686025728U,	// <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
-  2686025738U,	// <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
-  2686025748U,	// <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
-  3779084320U,	// <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
-  2642903388U,	// <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
-  3657723939U,	// <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
-  3926676514U,	// <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
-  3926675786U,	// <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
-  2686025802U,	// <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
-  2566070374U,	// <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
-  3759767642U,	// <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
-  1612284006U,	// <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2583988738U,	// <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
-  2566073654U,	// <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
-  2583990308U,	// <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
-  2589963005U,	// <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
-  2595935702U,	// <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
-  1612284060U,	// <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2686025892U,	// <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
-  2685804721U,	// <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
-  3759620282U,	// <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
-  2705342658U,	// <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
-  1612284108U,	// <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
-  3706029956U,	// <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
-  2686173406U,	// <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
-  3651769338U,	// <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
-  1612579056U,	// <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
-  3706030230U,	// <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
-  2705342720U,	// <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
-  2705342730U,	// <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
-  3706030492U,	// <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
-  2644896258U,	// <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
-  3718638154U,	// <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
-  3729918619U,	// <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
-  3926672384U,	// <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
-  2705342784U,	// <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
-  2687058250U,	// <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
-  2686026066U,	// <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
-  1613463900U,	// <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
-  3761021285U,	// <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
-  2687353198U,	// <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
-  2632289590U,	// <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2645560704U,	// <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
-  2646224337U,	// <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
-  1613906322U,	// <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
-  3651788902U,	// <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
-  2687795620U,	// <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
-  3761611181U,	// <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
-  3723284326U,	// <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
-  2646224838U,	// <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
-  3718639630U,	// <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
-  2652196962U,	// <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
-  2852932918U,	// <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852932919U,	// <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852933730U,	// <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
-  2925985894U,	// <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
-  3060203622U,	// <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
-  3718640178U,	// <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
-  2656178832U,	// <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
-  3725939378U,	// <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
-  2657506098U,	// <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
-  2619020110U,	// <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
-  2925986461U,	// <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
-  2572091494U,	// <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
-  2572092310U,	// <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
-  2980495524U,	// <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
-  2572094072U,	// <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
-  2572094774U,	// <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
-  4054238242U,	// <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
-  3645837653U,	// <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
-  4054239054U,	// <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
-  2572097326U,	// <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
-  2686026378U,	// <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
-  2686026386U,	// <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
-  1612284573U,	// <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2705343144U,	// <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
-  1616265906U,	// <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
-  2632292506U,	// <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2590020356U,	// <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
-  2852933161U,	// <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  1612284627U,	// <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2595995750U,	// <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
-  2646229094U,	// <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
-  3694092492U,	// <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
-  2686026486U,	// <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
-  2595999030U,	// <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
-  3767730952U,	// <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
-  2596000590U,	// <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
-  2596001246U,	// <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
-  2686026531U,	// <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
-  3763602219U,	// <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
-  2686026548U,	// <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
-  3764929346U,	// <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
-  2686026568U,	// <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
-  2691334996U,	// <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
-  3760874332U,	// <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
-  3765224294U,	// <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
-  3669751263U,	// <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
-  2686026613U,	// <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
-  2554208358U,	// <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
-  3763602311U,	// <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
-  3639895971U,	// <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
-  2686026646U,	// <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
-  2554211638U,	// <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
-  3760874411U,	// <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
-  2554212858U,	// <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
-  3802973114U,	// <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
-  2686026691U,	// <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
-  2566160486U,	// <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
-  2686026712U,	// <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
-  2686026724U,	// <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
-  3759768552U,	// <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
-  2692662262U,	// <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
-  2686026752U,	// <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
-  2590053128U,	// <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
-  3663795194U,	// <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
-  2686026775U,	// <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
-  2641587099U,	// <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
-  2693104684U,	// <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
-  3639912357U,	// <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
-  2687206462U,	// <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
-  3633941814U,	// <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
-  2693399632U,	// <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
-  3765077075U,	// <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
-  2646232530U,	// <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
-  2687206507U,	// <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
-  2647559796U,	// <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
-  3765077118U,	// <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
-  3767583878U,	// <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
-  2686026896U,	// <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
-  2693989528U,	// <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
-  3767805089U,	// <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
-  2652868706U,	// <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
-  3908250934U,	// <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
-  2686026941U,	// <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
-  2554241126U,	// <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
-  3763602639U,	// <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
-  3759547607U,	// <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
-  3115221094U,	// <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2554244406U,	// <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
-  3760874739U,	// <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
-  2554245944U,	// <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
-  3719975758U,	// <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
-  3115221099U,	// <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2560221286U,	// <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560222415U,	// <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
-  2980497558U,	// <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
-  3103211622U,	// <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
-  2560224566U,	// <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
-  2980495698U,	// <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
-  3633967526U,	// <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
-  4054237686U,	// <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
-  2560227118U,	// <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560229478U,	// <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
-  2686027117U,	// <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
-  2686027129U,	// <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
-  2686027132U,	// <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
-  2687206795U,	// <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
-  2686027157U,	// <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
-  2590094093U,	// <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
-  2596066790U,	// <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
-  2686027177U,	// <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
-  2646900736U,	// <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
-  1573159014U,	// <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2646900900U,	// <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
-  3759769037U,	// <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
-  2641592668U,	// <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
-  3779085794U,	// <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
-  2686027244U,	// <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
-  3669816807U,	// <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
-  1573159581U,	// <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
-  2230527897U,	// <6,2,1,0>: Cost 3 vrev <2,6,0,1>
-  2646901556U,	// <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
-  2646901654U,	// <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
-  2847047782U,	// <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
-  3771049517U,	// <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
-  2646901904U,	// <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
-  2686027324U,	// <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
-  3669825000U,	// <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
-  2231117793U,	// <6,2,1,u>: Cost 3 vrev <2,6,u,1>
-  3763603029U,	// <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
-  3759769184U,	// <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
-  2686027368U,	// <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
-  2686027378U,	// <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
-  2697971326U,	// <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
-  3759769224U,	// <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
-  2698118800U,	// <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
-  3920794092U,	// <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
-  2686027423U,	// <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
-  2686027430U,	// <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
-  3759769262U,	// <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
-  2698487485U,	// <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
-  2705344196U,	// <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
-  2686027470U,	// <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
-  2698708696U,	// <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
-  2724660961U,	// <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
-  2729232104U,	// <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
-  2686027502U,	// <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
-  1567853468U,	// <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  3759769351U,	// <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
-  2699151118U,	// <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
-  2686027543U,	// <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
-  2699298592U,	// <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
-  1573162294U,	// <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686027564U,	// <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
-  3719982547U,	// <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
-  1573162532U,	// <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
-  3779086154U,	// <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
-  2646904528U,	// <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
-  3759769440U,	// <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
-  2699888488U,	// <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
-  2230855617U,	// <6,2,5,4>: Cost 3 vrev <2,6,4,5>
-  2646904836U,	// <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
-  2646904930U,	// <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
-  2847051062U,	// <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  2700257173U,	// <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
-  2687207321U,	// <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
-  2686027684U,	// <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
-  2566260656U,	// <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
-  2685806522U,	// <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
-  2687207361U,	// <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
-  2686027724U,	// <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
-  2646905656U,	// <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
-  2646905678U,	// <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
-  2686027751U,	// <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
-  2554323046U,	// <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
-  2572239606U,	// <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
-  2566268849U,	// <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
-  1906753638U,	// <6,2,7,3>: Cost 2 vzipr RHS, LHS
-  2554326326U,	// <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
-  3304687564U,	// <6,2,7,5>: Cost 4 vrev <2,6,5,7>
-  2980495708U,	// <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2646906476U,	// <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
-  1906753643U,	// <6,2,7,u>: Cost 2 vzipr RHS, LHS
-  1591744256U,	// <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
-  1573164846U,	// <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2701805650U,	// <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
-  1906761830U,	// <6,2,u,3>: Cost 2 vzipr RHS, LHS
-  2686027875U,	// <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
-  1573165210U,	// <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686322800U,	// <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
-  2847051305U,	// <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  1906761835U,	// <6,2,u,u>: Cost 2 vzipr RHS, LHS
-  3759769739U,	// <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
-  2686027926U,	// <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
-  2686027937U,	// <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
-  3640027286U,	// <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
-  2687207601U,	// <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
-  2705344698U,	// <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
-  3663917847U,	// <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
-  2237008560U,	// <6,3,0,7>: Cost 3 vrev <3,6,7,0>
-  2686027989U,	// <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
-  3759769823U,	// <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
-  3759769830U,	// <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
-  3759769841U,	// <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
-  3759769848U,	// <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
-  2703280390U,	// <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3759769868U,	// <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
-  3704063194U,	// <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
-  3767732510U,	// <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
-  2703280390U,	// <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3704063468U,	// <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
-  2630321724U,	// <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769921U,	// <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
-  3759769928U,	// <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
-  3704063767U,	// <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
-  3704063876U,	// <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
-  2636957626U,	// <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
-  3777907058U,	// <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
-  2630321724U,	// <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769983U,	// <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
-  3710036245U,	// <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
-  2636958054U,	// <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
-  2686028188U,	// <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
-  2704607656U,	// <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
-  3773041072U,	// <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
-  3711363731U,	// <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
-  3767732676U,	// <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
-  2707999179U,	// <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
-  2584232038U,	// <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
-  2642267118U,	// <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
-  2642930751U,	// <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
-  2705197552U,	// <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
-  2584235318U,	// <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
-  1631603202U,	// <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
-  2654211444U,	// <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
-  2237041332U,	// <6,3,4,7>: Cost 3 vrev <3,6,7,4>
-  1631824413U,	// <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
-  3640066150U,	// <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
-  3772746288U,	// <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
-  3640067790U,	// <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
-  3773041216U,	// <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
-  2705934922U,	// <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
-  3773041236U,	// <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
-  3779086940U,	// <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
-  3767732831U,	// <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
-  2706229870U,	// <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
-  2602164326U,	// <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
-  2654212512U,	// <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
-  2566334393U,	// <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
-  3704066588U,	// <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
-  2602167524U,	// <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
-  3710702321U,	// <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
-  2724661933U,	// <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
-  3710702465U,	// <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
-  2602170158U,	// <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
-  1492598886U,	// <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
-  2560369889U,	// <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
-  1492600762U,	// <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
-  2566342806U,	// <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
-  1492602166U,	// <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
-  2602176208U,	// <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
-  2566345210U,	// <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
-  2980496528U,	// <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492604718U,	// <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
-  1492607078U,	// <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
-  2686028574U,	// <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
-  1492608955U,	// <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
-  2566350998U,	// <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
-  1492610358U,	// <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
-  1634257734U,	// <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
-  2566353489U,	// <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
-  2980504720U,	// <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492612910U,	// <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
-  3703406592U,	// <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
-  2629664870U,	// <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2629664972U,	// <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
-  3779087232U,	// <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
-  2642936156U,	// <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
-  2712570770U,	// <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
-  2687208348U,	// <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
-  3316723081U,	// <6,4,0,7>: Cost 4 vrev <4,6,7,0>
-  2629665437U,	// <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
-  2242473291U,	// <6,4,1,0>: Cost 3 vrev <4,6,0,1>
-  3700089652U,	// <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
-  3703407510U,	// <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
-  2852962406U,	// <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
-  3628166454U,	// <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
-  3760876514U,	// <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
-  2687208430U,	// <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
-  3316731274U,	// <6,4,1,7>: Cost 4 vrev <4,6,7,1>
-  2243063187U,	// <6,4,1,u>: Cost 3 vrev <4,6,u,1>
-  2629666284U,	// <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
-  3703408188U,	// <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
-  3703408232U,	// <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
-  3703408294U,	// <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
-  2632320816U,	// <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
-  2923384118U,	// <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
-  2687208508U,	// <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
-  3760950341U,	// <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
-  2634975348U,	// <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
-  3703408790U,	// <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
-  3316305238U,	// <6,4,3,1>: Cost 4 vrev <4,6,1,3>
-  3703408947U,	// <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
-  3703409052U,	// <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
-  2644929026U,	// <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
-  3718670922U,	// <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
-  2705345682U,	// <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
-  3926705152U,	// <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
-  2668817222U,	// <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
-  2590277734U,	// <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
-  3716017135U,	// <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
-  2642938944U,	// <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
-  3717344401U,	// <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
-  2712571088U,	// <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
-  2629668150U,	// <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1637649636U,	// <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2646257109U,	// <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
-  1637649636U,	// <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2566398054U,	// <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
-  3760876805U,	// <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
-  2566399937U,	// <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
-  2584316418U,	// <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
-  2566401334U,	// <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
-  2584318028U,	// <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
-  1612287286U,	// <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965686U,	// <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287304U,	// <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504608358U,	// <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2578350838U,	// <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
-  2578351720U,	// <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
-  2578352278U,	// <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
-  1504611638U,	// <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
-  2578353872U,	// <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
-  2578354682U,	// <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
-  2578355194U,	// <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
-  1504614190U,	// <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
-  2572386406U,	// <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
-  2572387226U,	// <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
-  3640157902U,	// <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
-  2572389020U,	// <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
-  2572389686U,	// <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
-  2980497102U,	// <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
-  2980495564U,	// <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
-  4054239090U,	// <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
-  2572392238U,	// <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
-  1504608358U,	// <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2629670702U,	// <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2566424516U,	// <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
-  2584340994U,	// <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
-  1640156694U,	// <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
-  2629671066U,	// <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1612287529U,	// <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965929U,	// <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287547U,	// <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  3708723200U,	// <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
-  2634981478U,	// <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
-  3694125260U,	// <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
-  3779087962U,	// <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
-  3760877154U,	// <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
-  4195110916U,	// <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
-  3696779775U,	// <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
-  1175212130U,	// <6,5,0,7>: Cost 2 vrev <5,6,7,0>
-  1175285867U,	// <6,5,0,u>: Cost 2 vrev <5,6,u,0>
-  2248445988U,	// <6,5,1,0>: Cost 3 vrev <5,6,0,1>
-  3698107237U,	// <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
-  3708724118U,	// <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
-  3908575334U,	// <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
-  3716023376U,	// <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
-  3708724368U,	// <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
-  3767733960U,	// <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
-  2712571600U,	// <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
-  2712571609U,	// <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
-  2578391142U,	// <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
-  3704079934U,	// <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
-  3708724840U,	// <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
-  3705407182U,	// <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
-  2578394422U,	// <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
-  3717351272U,	// <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
-  2634983354U,	// <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
-  3115486518U,	// <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
-  2634983541U,	// <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
-  3708725398U,	// <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
-  3710052631U,	// <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
-  3708725606U,	// <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
-  3708725660U,	// <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
-  2643610114U,	// <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
-  3717352010U,	// <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
-  3773632358U,	// <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
-  2248978533U,	// <6,5,3,7>: Cost 3 vrev <5,6,7,3>
-  2249052270U,	// <6,5,3,u>: Cost 3 vrev <5,6,u,3>
-  2596323430U,	// <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
-  3716025328U,	// <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
-  3716688961U,	// <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
-  2643610770U,	// <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
-  2596326710U,	// <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
-  2634984758U,	// <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  3767734199U,	// <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
-  1643696070U,	// <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
-  1643769807U,	// <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
-  2578415718U,	// <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
-  3652158198U,	// <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
-  3652159080U,	// <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
-  3652159638U,	// <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
-  2578418998U,	// <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
-  2712571908U,	// <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
-  2718027790U,	// <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
-  2712571928U,	// <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
-  2712571937U,	// <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
-  2705346596U,	// <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
-  3767144496U,	// <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
-  3773116473U,	// <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
-  2705346626U,	// <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
-  2705346636U,	// <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
-  3908577217U,	// <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
-  2578428728U,	// <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
-  2712572002U,	// <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
-  2705346668U,	// <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
-  2560516198U,	// <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560517363U,	// <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
-  2566490060U,	// <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
-  3634260118U,	// <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
-  2560519478U,	// <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
-  2980498650U,	// <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
-  2980497922U,	// <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  3103214902U,	// <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
-  2560522030U,	// <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560524390U,	// <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
-  2560525556U,	// <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
-  2566498253U,	// <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
-  2646931439U,	// <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
-  2560527670U,	// <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
-  2634987674U,	// <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  2980506114U,	// <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  1175277674U,	// <6,5,u,7>: Cost 2 vrev <5,6,7,u>
-  1175351411U,	// <6,5,u,u>: Cost 2 vrev <5,6,u,u>
-  2578448486U,	// <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
-  1573191782U,	// <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2686030124U,	// <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
-  3779088690U,	// <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
-  2687209788U,	// <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
-  3652194000U,	// <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
-  2254852914U,	// <6,6,0,6>: Cost 3 vrev <6,6,6,0>
-  4041575734U,	// <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
-  1573192349U,	// <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
-  2646934262U,	// <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
-  2646934324U,	// <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
-  2646934422U,	// <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
-  2846785638U,	// <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3760951694U,	// <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
-  2646934672U,	// <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
-  2712572320U,	// <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
-  3775549865U,	// <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
-  2846785643U,	// <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3759772094U,	// <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
-  3704751676U,	// <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
-  2631009936U,	// <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
-  2646935206U,	// <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
-  3759772127U,	// <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
-  3704752004U,	// <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
-  2646935482U,	// <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
-  2712572410U,	// <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
-  2712572419U,	// <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
-  2646935702U,	// <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
-  3777024534U,	// <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
-  3704752453U,	// <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
-  2646935964U,	// <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
-  2705347122U,	// <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
-  3779678778U,	// <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
-  2657553069U,	// <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
-  4039609654U,	// <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
-  2708001366U,	// <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
-  2578481254U,	// <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
-  3652223734U,	// <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
-  3760951922U,	// <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
-  3779089019U,	// <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
-  1570540772U,	// <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1573195062U,	// <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
-  2712572560U,	// <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
-  2723410591U,	// <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
-  1573195304U,	// <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
-  3640287334U,	// <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
-  2646937296U,	// <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
-  3640289235U,	// <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
-  3720679279U,	// <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
-  2646937542U,	// <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
-  2646937604U,	// <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
-  2646937698U,	// <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
-  2846788918U,	// <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
-  2846788919U,	// <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
-  1516699750U,	// <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  2590442230U,	// <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
-  2646938106U,	// <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
-  2590443670U,	// <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
-  1516703030U,	// <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  2590445264U,	// <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
-  296144182U,	// <6,6,6,6>: Cost 1 vdup2 RHS
-  2712572738U,	// <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
-  296144182U,	// <6,6,6,u>: Cost 1 vdup2 RHS
-  2566561894U,	// <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
-  3634332924U,	// <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
-  2566563797U,	// <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
-  2584480258U,	// <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
-  2566565174U,	// <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
-  2717438846U,	// <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
-  2980500280U,	// <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
-  1906756918U,	// <6,6,7,7>: Cost 2 vzipr RHS, RHS
-  1906756919U,	// <6,6,7,u>: Cost 2 vzipr RHS, RHS
-  1516699750U,	// <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  1573197614U,	// <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2566571990U,	// <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
-  2846786205U,	// <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  1516703030U,	// <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  1573197978U,	// <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
-  296144182U,	// <6,6,u,6>: Cost 1 vdup2 RHS
-  1906765110U,	// <6,6,u,7>: Cost 2 vzipr RHS, RHS
-  296144182U,	// <6,6,u,u>: Cost 1 vdup2 RHS
-  1571209216U,	// <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-  497467494U,	// <6,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571209380U,	// <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2644951292U,	// <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
-  1571209554U,	// <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510756450U,	// <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
-  2644951542U,	// <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  2584499194U,	// <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
-  497468061U,	// <6,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571209974U,	// <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571210036U,	// <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571210134U,	// <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1571210200U,	// <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2644952098U,	// <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
-  1571210384U,	// <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644952271U,	// <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2578535418U,	// <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
-  1571210605U,	// <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
-  2644952509U,	// <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
-  2644952582U,	// <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571210856U,	// <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571210918U,	// <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2644952828U,	// <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
-  2633009028U,	// <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
-  1571211194U,	// <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2668840938U,	// <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
-  1571211323U,	// <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571211414U,	// <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644953311U,	// <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2644953390U,	// <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
-  1571211676U,	// <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571211778U,	// <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2644953648U,	// <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
-  2644953720U,	// <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
-  2644953795U,	// <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571212062U,	// <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1573202834U,	// <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644954058U,	// <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  2644954166U,	// <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2644954258U,	// <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
-  1571212496U,	// <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-  497470774U,	// <6,7,4,5>: Cost 1 vext2 RHS, RHS
-  1573203316U,	// <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2646281688U,	// <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
-  497471017U,	// <6,7,4,u>: Cost 1 vext2 RHS, RHS
-  2644954696U,	// <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1573203664U,	// <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2644954878U,	// <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2644954991U,	// <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571213254U,	// <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571213316U,	// <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571213410U,	// <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1573204136U,	// <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1573204217U,	// <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  2644955425U,	// <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
-  2644955561U,	// <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
-  1573204474U,	// <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2644955698U,	// <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  2644955789U,	// <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
-  2644955889U,	// <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
-  1571214136U,	// <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571214158U,	// <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1573204895U,	// <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1573204986U,	// <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2572608656U,	// <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
-  2644956362U,	// <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
-  2572610231U,	// <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
-  1573205350U,	// <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  2646947220U,	// <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
-  1516786498U,	// <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
-  1571214956U,	// <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
-  1573205634U,	// <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
-  1571215059U,	// <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-  497473326U,	// <6,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571215237U,	// <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571215292U,	// <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571215423U,	// <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-  497473690U,	// <6,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571215568U,	// <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-  1573206272U,	// <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
-  497473893U,	// <6,7,u,u>: Cost 1 vext2 RHS, LHS
-  1571217408U,	// <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-  497475686U,	// <6,u,0,1>: Cost 1 vext2 RHS, LHS
-  1571217572U,	// <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2689865445U,	// <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
-  1571217746U,	// <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510830187U,	// <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
-  2644959734U,	// <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  1193130221U,	// <6,u,0,7>: Cost 2 vrev <u,6,7,0>
-  497476253U,	// <6,u,0,u>: Cost 1 vext2 RHS, LHS
-  1571218166U,	// <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571218228U,	// <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1612289838U,	// <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571218392U,	// <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2566663478U,	// <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
-  1571218576U,	// <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644960463U,	// <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2717439835U,	// <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
-  1612289892U,	// <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  1504870502U,	// <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
-  2644960774U,	// <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571219048U,	// <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571219110U,	// <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  1504873782U,	// <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
-  2633017221U,	// <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
-  1571219386U,	// <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2712573868U,	// <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
-  1571219515U,	// <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571219606U,	// <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644961503U,	// <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2566678499U,	// <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
-  1571219868U,	// <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571219970U,	// <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2689865711U,	// <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
-  2708002806U,	// <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
-  2644961987U,	// <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571220254U,	// <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571220370U,	// <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644962250U,	// <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  1661245476U,	// <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
-  2686031917U,	// <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
-  1571220688U,	// <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-  497478967U,	// <6,u,4,5>: Cost 1 vext2 RHS, RHS
-  1571220852U,	// <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1661614161U,	// <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
-  497479209U,	// <6,u,4,u>: Cost 1 vext2 RHS, RHS
-  2566692966U,	// <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
-  1571221200U,	// <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2566694885U,	// <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
-  2689865855U,	// <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
-  1571221446U,	// <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571221508U,	// <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1612290202U,	// <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  1571221672U,	// <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1612290220U,	// <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504903270U,	// <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
-  2644963752U,	// <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571222010U,	// <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2686032080U,	// <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
-  1504906550U,	// <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
-  2644964079U,	// <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
-  296144182U,	// <6,u,6,6>: Cost 1 vdup2 RHS
-  1571222350U,	// <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  296144182U,	// <6,u,6,u>: Cost 1 vdup2 RHS
-  1492967526U,	// <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
-  2560738574U,	// <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
-  1492969447U,	// <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
-  1906753692U,	// <6,u,7,3>: Cost 2 vzipr RHS, LHS
-  1492970806U,	// <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
-  2980495761U,	// <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
-  1516860235U,	// <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
-  1906756936U,	// <6,u,7,7>: Cost 2 vzipr RHS, RHS
-  1492973358U,	// <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
-  1492975718U,	// <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
-  497481518U,	// <6,u,u,1>: Cost 1 vext2 RHS, LHS
-  1612290405U,	// <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571223484U,	// <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1492978998U,	// <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
-  497481882U,	// <6,u,u,5>: Cost 1 vext2 RHS, RHS
-  296144182U,	// <6,u,u,6>: Cost 1 vdup2 RHS
-  1906765128U,	// <6,u,u,7>: Cost 2 vzipr RHS, RHS
-  497482085U,	// <6,u,u,u>: Cost 1 vext2 RHS, LHS
-  1638318080U,	// <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638318090U,	// <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
-  1638318100U,	// <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
-  3646442178U,	// <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
-  2712059941U,	// <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
-  2651603364U,	// <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
-  2590618445U,	// <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
-  3785801798U,	// <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
-  1638318153U,	// <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
-  1516879974U,	// <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
-  2693922911U,	// <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
-  564576358U,	// <7,0,1,2>: Cost 1 vext3 RHS, LHS
-  2638996480U,	// <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
-  1516883254U,	// <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
-  2649613456U,	// <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
-  1516884814U,	// <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
-  2590626808U,	// <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
-  564576412U,	// <7,0,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,	// <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2692743344U,	// <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
-  2712060084U,	// <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
-  2712060094U,	// <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
-  1638318284U,	// <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712060118U,	// <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2651604922U,	// <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
-  2686255336U,	// <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
-  1638318316U,	// <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
-  2651605142U,	// <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
-  2712060156U,	// <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
-  2712060165U,	// <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
-  2651605404U,	// <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
-  2651605506U,	// <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
-  2638998111U,	// <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
-  2639661744U,	// <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
-  3712740068U,	// <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
-  2640989010U,	// <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
-  2712060232U,	// <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
-  1638318418U,	// <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
-  1638318428U,	// <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
-  3646474950U,	// <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
-  2712060270U,	// <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
-  1577864502U,	// <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  2651606388U,	// <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
-  3787792776U,	// <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
-  1638318481U,	// <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
-  2590654566U,	// <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
-  2651606736U,	// <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
-  2712060334U,	// <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649616239U,	// <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
-  2651606982U,	// <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
-  2651607044U,	// <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
-  1577865314U,	// <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
-  2651607208U,	// <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
-  1579192580U,	// <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
-  2688393709U,	// <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
-  2712060406U,	// <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  2688541183U,	// <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
-  2655588936U,	// <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
-  3762430481U,	// <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
-  2651607730U,	// <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
-  2651607864U,	// <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
-  2651607886U,	// <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
-  2688983605U,	// <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
-  2651608058U,	// <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
-  2932703334U,	// <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
-  3066921062U,	// <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
-  3712742678U,	// <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
-  2651608422U,	// <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
-  2651608513U,	// <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
-  2663552532U,	// <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
-  2651608684U,	// <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
-  2651608706U,	// <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
-  1638318730U,	// <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
-  1638318738U,	// <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
-  564576925U,	// <7,0,u,2>: Cost 1 vext3 RHS, LHS
-  2572765898U,	// <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
-  1638318770U,	// <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
-  1577867418U,	// <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  1516942165U,	// <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
-  2651609344U,	// <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
-  564576979U,	// <7,0,u,u>: Cost 1 vext3 RHS, LHS
-  2590687334U,	// <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
-  2639003750U,	// <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
-  2793357414U,	// <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
-  1638318838U,	// <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
-  2590690614U,	// <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
-  2712060679U,	// <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2590692182U,	// <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
-  3785802521U,	// <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
-  1638318883U,	// <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
-  2712060715U,	// <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
-  1638318900U,	// <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-  3774300994U,	// <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
-  1638318920U,	// <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
-  2712060755U,	// <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
-  2691416926U,	// <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
-  2590700375U,	// <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
-  3765158766U,	// <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
-  1638318965U,	// <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
-  2712060796U,	// <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
-  2712060807U,	// <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
-  3712747112U,	// <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
-  1638318998U,	// <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
-  2712060836U,	// <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
-  2712060843U,	// <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
-  2590708568U,	// <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
-  2735948730U,	// <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
-  1638319043U,	// <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
-  2712060876U,	// <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
-  1638319064U,	// <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2712060894U,	// <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
-  2692596718U,	// <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
-  2712060917U,	// <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
-  1619002368U,	// <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
-  2692817929U,	// <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
-  2735948814U,	// <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
-  1619223579U,	// <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
-  2712060962U,	// <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
-  2712060971U,	// <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
-  2712060980U,	// <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
-  2712060989U,	// <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
-  3785802822U,	// <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
-  2639007030U,	// <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
-  2645642634U,	// <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
-  3719384520U,	// <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
-  2639007273U,	// <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
-  2572812390U,	// <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
-  2693776510U,	// <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
-  3774301318U,	// <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
-  1620182160U,	// <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
-  2572815670U,	// <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
-  3766486178U,	// <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
-  2651615331U,	// <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
-  2652278964U,	// <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
-  1620550845U,	// <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
-  3768108230U,	// <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
-  2694440143U,	// <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
-  2712061144U,	// <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2694587617U,	// <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
-  3768403178U,	// <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
-  2694735091U,	// <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
-  3768550652U,	// <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
-  2652279630U,	// <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
-  2694956302U,	// <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
-  2645644282U,	// <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
-  2859062094U,	// <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
-  3779462437U,	// <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
-  3121938534U,	// <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2554916150U,	// <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
-  3769140548U,	// <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
-  3726022164U,	// <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
-  2554918508U,	// <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
-  3121938539U,	// <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2572836966U,	// <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
-  1638319469U,	// <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
-  2712061299U,	// <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
-  1622173059U,	// <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
-  2572840246U,	// <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
-  1622320533U,	// <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
-  2696136094U,	// <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
-  2859060777U,	// <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
-  1622541744U,	// <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
-  2712061364U,	// <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
-  2712061373U,	// <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
-  2712061380U,	// <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
-  2712061389U,	// <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
-  2712061404U,	// <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
-  2696725990U,	// <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
-  2712061417U,	// <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
-  3785803251U,	// <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
-  2696947201U,	// <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
-  2712061446U,	// <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
-  3785803276U,	// <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
-  3785803285U,	// <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
-  2712061471U,	// <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
-  2712061482U,	// <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
-  3766486576U,	// <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
-  2712061500U,	// <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
-  2602718850U,	// <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  2712061516U,	// <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
-  2712061525U,	// <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
-  2712061536U,	// <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
-  1638319720U,	// <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638319730U,	// <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
-  2712061565U,	// <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
-  2698053256U,	// <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
-  2712061584U,	// <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
-  3771795096U,	// <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
-  1638319775U,	// <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
-  1638319782U,	// <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
-  2693924531U,	// <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
-  2700560061U,	// <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
-  2693924551U,	// <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
-  1638319822U,	// <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
-  2698716889U,	// <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
-  2712061665U,	// <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
-  2735949540U,	// <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
-  1638319854U,	// <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
-  2712061692U,	// <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
-  2712061698U,	// <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
-  2712061708U,	// <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
-  2712061718U,	// <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
-  2712061728U,	// <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
-  2699380522U,	// <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
-  2712061740U,	// <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
-  3809691445U,	// <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
-  2699601733U,	// <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
-  2699675470U,	// <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
-  3766486867U,	// <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
-  2699822944U,	// <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
-  2692745065U,	// <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
-  2699970418U,	// <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
-  3766486907U,	// <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
-  2700117892U,	// <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
-  3771795334U,	// <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
-  2692745110U,	// <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
-  2572894310U,	// <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
-  2712061860U,	// <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
-  2700486577U,	// <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
-  1626818490U,	// <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
-  2572897590U,	// <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
-  2700707788U,	// <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
-  2700781525U,	// <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
-  3774597086U,	// <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
-  1627187175U,	// <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
-  2735949802U,	// <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
-  3780200434U,	// <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
-  3773564928U,	// <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
-  2986541158U,	// <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
-  2554989878U,	// <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
-  3775113245U,	// <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
-  4060283228U,	// <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
-  2554992236U,	// <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
-  2986541163U,	// <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
-  1638320187U,	// <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
-  2693924936U,	// <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
-  1638319720U,	// <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1628145756U,	// <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
-  1638320227U,	// <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
-  2702035054U,	// <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
-  2702108791U,	// <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
-  2735949945U,	// <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
-  1628514441U,	// <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
-  2712062091U,	// <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
-  1638320278U,	// <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
-  2712062109U,	// <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
-  2590836886U,	// <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
-  2712062128U,	// <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
-  2712062138U,	// <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
-  2590839656U,	// <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
-  3311414017U,	// <7,3,0,7>: Cost 4 vrev <3,7,7,0>
-  1638320341U,	// <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
-  2237164227U,	// <7,3,1,0>: Cost 3 vrev <3,7,0,1>
-  2712062182U,	// <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
-  2712062193U,	// <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
-  2692745468U,	// <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
-  2712062214U,	// <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
-  2693925132U,	// <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
-  3768183059U,	// <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
-  2692745504U,	// <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
-  2696063273U,	// <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
-  2712062254U,	// <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
-  2712062262U,	// <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
-  2712062273U,	// <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
-  2712062280U,	// <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
-  2712062294U,	// <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
-  2712062302U,	// <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
-  2700560742U,	// <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
-  2712062319U,	// <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
-  2712062325U,	// <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
-  2712062335U,	// <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
-  2636368158U,	// <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
-  2637031791U,	// <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
-  1638320540U,	// <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062374U,	// <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
-  2704689586U,	// <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
-  2590864235U,	// <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
-  2704837060U,	// <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
-  1638320540U,	// <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062416U,	// <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
-  2712062426U,	// <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
-  2566981640U,	// <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
-  2712062447U,	// <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
-  2712062456U,	// <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
-  1638320642U,	// <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
-  2648313204U,	// <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
-  3311446789U,	// <7,3,4,7>: Cost 4 vrev <3,7,7,4>
-  1638320669U,	// <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
-  2602819686U,	// <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
-  1574571728U,	// <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
-  2648977185U,	// <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
-  2705869378U,	// <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
-  2237491947U,	// <7,3,5,4>: Cost 3 vrev <3,7,4,5>
-  2706016852U,	// <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
-  2648313954U,	// <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
-  2692745823U,	// <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
-  1579217159U,	// <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
-  2706311800U,	// <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
-  2654286249U,	// <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
-  1581208058U,	// <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
-  2706533011U,	// <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
-  2706606748U,	// <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
-  3780422309U,	// <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
-  2712062637U,	// <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
-  2706827959U,	// <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
-  1585189856U,	// <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
-  2693925571U,	// <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
-  2693925584U,	// <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
-  2700561114U,	// <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
-  2572978916U,	// <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
-  2693925611U,	// <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
-  2707344118U,	// <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
-  2654950894U,	// <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
-  2648315500U,	// <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
-  2693925643U,	// <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
-  2237221578U,	// <7,3,u,0>: Cost 3 vrev <3,7,0,u>
-  1638320926U,	// <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
-  1593153452U,	// <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
-  1638320540U,	// <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2237516526U,	// <7,3,u,4>: Cost 3 vrev <3,7,4,u>
-  1638320966U,	// <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
-  2712062796U,	// <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
-  2692967250U,	// <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
-  1638320989U,	// <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
-  2651635712U,	// <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
-  1577893990U,	// <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2651635876U,	// <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
-  3785804672U,	// <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
-  2651636050U,	// <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
-  1638468498U,	// <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638468508U,	// <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787795364U,	// <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1640459181U,	// <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
-  2651636470U,	// <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
-  2651636532U,	// <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
-  2712062922U,	// <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
-  2639029248U,	// <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
-  2712062940U,	// <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
-  2712062946U,	// <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
-  2712062958U,	// <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
-  3785804791U,	// <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
-  2712062973U,	// <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
-  3785804807U,	// <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
-  3785804818U,	// <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
-  2651637352U,	// <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
-  2651637414U,	// <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
-  3716753194U,	// <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
-  2712063030U,	// <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  2712063036U,	// <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
-  3773123658U,	// <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
-  2712063054U,	// <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
-  2651637910U,	// <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
-  3712772348U,	// <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
-  3785804906U,	// <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
-  2651638172U,	// <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
-  2651638274U,	// <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
-  2639030883U,	// <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
-  2712063122U,	// <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
-  3712772836U,	// <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
-  2641021782U,	// <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
-  2714053802U,	// <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
-  3785804978U,	// <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
-  3716754505U,	// <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
-  3785804998U,	// <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
-  1638321360U,	// <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638468826U,	// <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
-  1638468836U,	// <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  3785215214U,	// <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
-  1640459509U,	// <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
-  1517207654U,	// <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
-  2573034640U,	// <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
-  2712063246U,	// <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
-  2573036267U,	// <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
-  1517210934U,	// <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
-  2711989549U,	// <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
-  564579638U,	// <7,4,5,6>: Cost 1 vext3 RHS, RHS
-  2651639976U,	// <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
-  564579656U,	// <7,4,5,u>: Cost 1 vext3 RHS, RHS
-  2712063307U,	// <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
-  3767668056U,	// <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
-  2651640314U,	// <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
-  2655621708U,	// <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
-  1638468980U,	// <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712063358U,	// <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
-  2712063367U,	// <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
-  2712210826U,	// <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1638469012U,	// <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
-  2651640826U,	// <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
-  3773713830U,	// <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
-  3773713842U,	// <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
-  3780349372U,	// <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
-  2651641140U,	// <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
-  2712210888U,	// <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-  2712210898U,	// <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
-  2651641452U,	// <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
-  2713538026U,	// <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
-  1517232230U,	// <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
-  1577899822U,	// <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2712063489U,	// <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
-  2573060846U,	// <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
-  1640312342U,	// <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
-  1638469146U,	// <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
-  564579881U,	// <7,4,u,6>: Cost 1 vext3 RHS, RHS
-  2714054192U,	// <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
-  564579899U,	// <7,4,u,u>: Cost 1 vext3 RHS, RHS
-  2579038310U,	// <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
-  2636382310U,	// <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2796339302U,	// <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
-  3646810719U,	// <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
-  2712063586U,	// <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
-  2735951467U,	// <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
-  2735951476U,	// <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
-  2579043322U,	// <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
-  2636382877U,	// <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211087U,	// <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
-  3698180916U,	// <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
-  3710124950U,	// <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
-  2636383232U,	// <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
-  2712211127U,	// <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
-  2590994128U,	// <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
-  2590995323U,	// <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
-  1638469328U,	// <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638469337U,	// <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  3785805536U,	// <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
-  3785805544U,	// <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
-  3704817288U,	// <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
-  2712063742U,	// <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
-  3716761386U,	// <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
-  2714054415U,	// <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  3774304024U,	// <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
-  2712063777U,	// <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
-  2712063787U,	// <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
-  3634888806U,	// <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
-  2636384544U,	// <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
-  3710790001U,	// <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
-  3710126492U,	// <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
-  3634892086U,	// <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
-  2639039076U,	// <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
-  3713444533U,	// <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
-  2693926767U,	// <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
-  2712063864U,	// <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
-  2579071078U,	// <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
-  3646841856U,	// <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
-  3716762698U,	// <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
-  3646843491U,	// <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
-  2579074358U,	// <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
-  2636385590U,	// <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
-  2645675406U,	// <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
-  1638322118U,	// <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1638469583U,	// <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
-  2714054611U,	// <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
-  2652974800U,	// <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
-  3710127905U,	// <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
-  3785805808U,	// <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
-  2712211450U,	// <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
-  1638322180U,	// <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
-  2712064014U,	// <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638469656U,	// <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-  1638469665U,	// <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
-  2712064036U,	// <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
-  2714054707U,	// <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
-  3785805879U,	// <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
-  2712064066U,	// <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
-  2712064076U,	// <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
-  2714054743U,	// <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712064096U,	// <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-  1638322274U,	// <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
-  1638469739U,	// <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
-  1511325798U,	// <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
-  2692747392U,	// <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
-  2585069160U,	// <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
-  2573126390U,	// <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
-  1511329078U,	// <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
-  1638469800U,	// <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712211626U,	// <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2712211636U,	// <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
-  1638469823U,	// <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
-  1511333990U,	// <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
-  2636388142U,	// <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211671U,	// <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
-  2573134583U,	// <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
-  1511337270U,	// <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
-  1638469881U,	// <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
-  2712064258U,	// <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
-  1638469892U,	// <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
-  1638469904U,	// <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
-  2650324992U,	// <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
-  1576583270U,	// <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712064300U,	// <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
-  2255295336U,	// <7,6,0,3>: Cost 3 vrev <6,7,3,0>
-  2712064316U,	// <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
-  2585088098U,	// <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
-  2735952204U,	// <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
-  2712211799U,	// <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
-  1576583837U,	// <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
-  1181340494U,	// <7,6,1,0>: Cost 2 vrev <6,7,0,1>
-  2650325812U,	// <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
-  2650325910U,	// <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
-  2650325976U,	// <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
-  2579123510U,	// <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
-  2650326160U,	// <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
-  2714055072U,	// <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2712064425U,	// <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
-  1181930390U,	// <7,6,1,u>: Cost 2 vrev <6,7,u,1>
-  2712211897U,	// <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
-  2714055108U,	// <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
-  2650326632U,	// <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
-  2650326694U,	// <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
-  2714055137U,	// <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
-  2714055148U,	// <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
-  2650326970U,	// <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
-  1638470138U,	// <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638470147U,	// <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2650327190U,	// <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
-  2255172441U,	// <7,6,3,1>: Cost 3 vrev <6,7,1,3>
-  2255246178U,	// <7,6,3,2>: Cost 3 vrev <6,7,2,3>
-  2650327452U,	// <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
-  2712064562U,	// <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
-  2650327627U,	// <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
-  3713452726U,	// <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
-  2700563016U,	// <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
-  2712064593U,	// <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
-  2650327954U,	// <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
-  2735952486U,	// <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
-  2735952497U,	// <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
-  2255328108U,	// <7,6,4,3>: Cost 3 vrev <6,7,3,4>
-  2712212100U,	// <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
-  1576586550U,	// <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  2714055312U,	// <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
-  2712212126U,	// <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
-  1576586793U,	// <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
-  2579152998U,	// <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
-  2650328784U,	// <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
-  2714055364U,	// <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
-  3785806538U,	// <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
-  1576587206U,	// <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
-  2650329092U,	// <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
-  2650329186U,	// <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
-  2712064753U,	// <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
-  1181963162U,	// <7,6,5,u>: Cost 2 vrev <6,7,u,5>
-  2714055421U,	// <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
-  2714055432U,	// <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
-  2650329594U,	// <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
-  3785806619U,	// <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
-  2712212260U,	// <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
-  2714055472U,	// <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
-  1638323000U,	// <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470466U,	// <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-  1638470475U,	// <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
-  1638323022U,	// <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
-  2712064854U,	// <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
-  2712064865U,	// <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
-  2712064872U,	// <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
-  1638323062U,	// <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
-  2712064894U,	// <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
-  2712064905U,	// <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
-  2712064915U,	// <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
-  1638323094U,	// <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
-  1638470559U,	// <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
-  1576589102U,	// <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712212402U,	// <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
-  2712212409U,	// <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
-  1638470599U,	// <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
-  1576589466U,	// <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  1638323000U,	// <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470624U,	// <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
-  1638470631U,	// <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
-  2712065007U,	// <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
-  1638323194U,	// <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
-  2712065025U,	// <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
-  3646958337U,	// <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
-  2712065044U,	// <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
-  2585161907U,	// <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
-  2591134604U,	// <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
-  2591134714U,	// <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
-  1638323257U,	// <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
-  2712065091U,	// <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
-  2712065098U,	// <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
-  2712065109U,	// <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
-  2692748384U,	// <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
-  2585169206U,	// <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
-  2693928048U,	// <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
-  2585170766U,	// <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
-  2735953024U,	// <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
-  2695918731U,	// <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
-  3770471574U,	// <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
-  3785807002U,	// <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
-  2712065189U,	// <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
-  2712065196U,	// <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
-  3773125818U,	// <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
-  3766490305U,	// <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
-  2700563658U,	// <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
-  2735953107U,	// <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
-  2701890780U,	// <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
-  2712065251U,	// <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
-  3766490350U,	// <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
-  3774305530U,	// <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
-  2637728196U,	// <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
-  2712065291U,	// <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
-  2585186486U,	// <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
-  2639719095U,	// <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
-  2640382728U,	// <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
-  2641046361U,	// <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
-  2712212792U,	// <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
-  3646989312U,	// <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
-  3785807176U,	// <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
-  3646991109U,	// <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
-  2712065371U,	// <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
-  1638323558U,	// <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
-  2712212845U,	// <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
-  2591167846U,	// <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
-  1638323585U,	// <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
-  2585198694U,	// <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
-  2712212884U,	// <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
-  3711471393U,	// <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
-  2649673590U,	// <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
-  2712065455U,	// <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
-  1577259032U,	// <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
-  2712065473U,	// <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
-  2712212936U,	// <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
-  1579249931U,	// <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
-  2591178854U,	// <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
-  2735953374U,	// <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
-  2712212974U,	// <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
-  2655646287U,	// <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
-  2591182134U,	// <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
-  2656973553U,	// <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
-  1583895362U,	// <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
-  2712065556U,	// <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
-  1585222628U,	// <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
-  1523417190U,	// <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  2597159670U,	// <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
-  2597160552U,	// <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
-  2597161110U,	// <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
-  1523420470U,	// <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  2651002296U,	// <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
-  2657637906U,	// <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
-  363253046U,	// <7,7,7,7>: Cost 1 vdup3 RHS
-  363253046U,	// <7,7,7,u>: Cost 1 vdup3 RHS
-  1523417190U,	// <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  1638471298U,	// <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
-  2712213132U,	// <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
-  2712213138U,	// <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
-  1523420470U,	// <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  1638471338U,	// <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
-  1595840756U,	// <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
-  363253046U,	// <7,7,u,7>: Cost 1 vdup3 RHS
-  363253046U,	// <7,7,u,u>: Cost 1 vdup3 RHS
-  1638318080U,	// <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638323923U,	// <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
-  1662211804U,	// <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
-  1638323941U,	// <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
-  2712065773U,	// <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
-  1662359286U,	// <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
-  1662359296U,	// <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  2987150664U,	// <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
-  1638323986U,	// <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
-  1517469798U,	// <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
-  1638318900U,	// <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-  564582190U,	// <7,u,1,2>: Cost 1 vext3 RHS, LHS
-  1638324023U,	// <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
-  1517473078U,	// <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
-  2693928777U,	// <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
-  1517474710U,	// <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
-  1640462171U,	// <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-  564582244U,	// <7,u,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,	// <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2712065907U,	// <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
-  1638319720U,	// <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638324101U,	// <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
-  1638318284U,	// <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712065947U,	// <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
-  2700564387U,	// <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
-  1640314796U,	// <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-  1638324146U,	// <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
-  1638324156U,	// <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
-  1638319064U,	// <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2700564435U,	// <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
-  1638320540U,	// <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  1638324196U,	// <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
-  1638324207U,	// <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
-  2700564472U,	// <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
-  2695919610U,	// <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
-  1638324228U,	// <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
-  2712066061U,	// <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
-  1662212122U,	// <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
-  1662212132U,	// <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
-  2712066092U,	// <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
-  1638321360U,	// <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638324287U,	// <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
-  1662359624U,	// <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
-  1640314961U,	// <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-  1638324314U,	// <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
-  1517502566U,	// <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
-  1574612693U,	// <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
-  2712066162U,	// <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
-  1638324351U,	// <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
-  1576603592U,	// <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
-  1577267225U,	// <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
-  564582554U,	// <7,u,5,6>: Cost 1 vext3 RHS, RHS
-  1640462499U,	// <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
-  564582572U,	// <7,u,5,u>: Cost 1 vext3 RHS, RHS
-  2712066223U,	// <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
-  2712066238U,	// <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
-  1581249023U,	// <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
-  1638324432U,	// <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
-  1638468980U,	// <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712066274U,	// <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
-  1583903555U,	// <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
-  1640315117U,	// <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
-  1638324477U,	// <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
-  1638471936U,	// <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
-  2692970763U,	// <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
-  2700933399U,	// <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
-  2573347601U,	// <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
-  1638471976U,	// <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
-  1511551171U,	// <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
-  2712213815U,	// <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
-  363253046U,	// <7,u,7,7>: Cost 1 vdup3 RHS
-  363253046U,	// <7,u,7,u>: Cost 1 vdup3 RHS
-  1638324561U,	// <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
-  1638324571U,	// <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
-  564582757U,	// <7,u,u,2>: Cost 1 vext3 RHS, LHS
-  1638324587U,	// <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
-  1638324601U,	// <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
-  1638324611U,	// <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
-  564582797U,	// <7,u,u,6>: Cost 1 vext3 RHS, RHS
-  363253046U,	// <7,u,u,7>: Cost 1 vdup3 RHS
-  564582811U,	// <7,u,u,u>: Cost 1 vext3 RHS, LHS
-  135053414U,	// <u,0,0,0>: Cost 1 vdup0 LHS
-  1611489290U,	// <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611489300U,	// <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  2568054923U,	// <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1481706806U,	// <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
-  2555449040U,	// <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
-  2591282078U,	// <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
-  2591945711U,	// <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-  135053414U,	// <u,0,0,u>: Cost 1 vdup0 LHS
-  1493655654U,	// <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
-  1860550758U,	// <u,0,1,1>: Cost 2 vzipl LHS, LHS
-  537747563U,	// <u,0,1,2>: Cost 1 vext3 LHS, LHS
-  2625135576U,	// <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
-  1493658934U,	// <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
-  2625135760U,	// <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
-  1517548447U,	// <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
-  2591290362U,	// <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
-  537747612U,	// <u,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611489444U,	// <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685231276U,	// <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  1994768486U,	// <u,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2685231294U,	// <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611489484U,	// <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2712068310U,	// <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2625136570U,	// <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
-  2591962097U,	// <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1611489516U,	// <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2954067968U,	// <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2685231356U,	// <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-  72589981U,	// <u,0,3,2>: Cost 1 vrev LHS
-  2625137052U,	// <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
-  2625137154U,	// <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
-  2639071848U,	// <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
-  2639735481U,	// <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
-  2597279354U,	// <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
-  73032403U,	// <u,0,3,u>: Cost 1 vrev LHS
-  2687074636U,	// <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
-  1611489618U,	// <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611489628U,	// <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3629222038U,	// <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
-  2555481398U,	// <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
-  1551396150U,	// <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  2651680116U,	// <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
-  2646150600U,	// <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1611932050U,	// <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2561458278U,	// <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
-  1863532646U,	// <u,0,5,1>: Cost 2 vzipl RHS, LHS
-  2712068526U,	// <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649689976U,	// <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
-  2220237489U,	// <u,0,5,4>: Cost 3 vrev <0,u,4,5>
-  2651680772U,	// <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
-  1577939051U,	// <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
-  2830077238U,	// <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-  1579266317U,	// <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
-  2555494502U,	// <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
-  2712068598U,	// <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  1997750374U,	// <u,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2655662673U,	// <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
-  2555497782U,	// <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
-  2651681459U,	// <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
-  2651681592U,	// <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
-  2651681614U,	// <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
-  1997750428U,	// <u,0,6,u>: Cost 2 vtrnl RHS, LHS
-  2567446630U,	// <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
-  2567447446U,	// <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
-  2567448641U,	// <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
-  2573421338U,	// <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
-  2567449910U,	// <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
-  2651682242U,	// <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
-  2591339429U,	// <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
-  2651682412U,	// <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
-  2567452462U,	// <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
-  135053414U,	// <u,0,u,0>: Cost 1 vdup0 LHS
-  1611489938U,	// <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-  537748125U,	// <u,0,u,2>: Cost 1 vext3 LHS, LHS
-  2685674148U,	// <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1611932338U,	// <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551399066U,	// <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  1517605798U,	// <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
-  2830077481U,	// <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-  537748179U,	// <u,0,u,u>: Cost 1 vext3 LHS, LHS
-  1544101961U,	// <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
-  1558036582U,	// <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
-  2619171051U,	// <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
-  1611490038U,	// <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2555522358U,	// <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
-  2712068871U,	// <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2591355815U,	// <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
-  2597328512U,	// <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
-  1611490083U,	// <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  1481785446U,	// <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
-  202162278U,	// <u,1,1,1>: Cost 1 vdup1 LHS
-  2555528808U,	// <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
-  1611490120U,	// <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  1481788726U,	// <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
-  2689876828U,	// <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  2591364008U,	// <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
-  2592691274U,	// <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-  202162278U,	// <u,1,1,u>: Cost 1 vdup1 LHS
-  1499709542U,	// <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
-  2689876871U,	// <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2631116445U,	// <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
-  835584U,	// <u,1,2,3>: Cost 0 copy LHS
-  1499712822U,	// <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
-  2689876907U,	// <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  2631780282U,	// <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
-  1523603074U,	// <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
-  835584U,	// <u,1,2,u>: Cost 0 copy LHS
-  1487773798U,	// <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
-  1611490264U,	// <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685232094U,	// <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2018746470U,	// <u,1,3,3>: Cost 2 vtrnr LHS, LHS
-  1487777078U,	// <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
-  1611490304U,	// <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2685674505U,	// <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2640407307U,	// <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
-  1611490327U,	// <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  1567992749U,	// <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
-  2693121070U,	// <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
-  2693194807U,	// <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
-  1152386432U,	// <u,1,4,3>: Cost 2 vrev <1,u,3,4>
-  2555555126U,	// <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
-  1558039862U,	// <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
-  2645716371U,	// <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
-  2597361284U,	// <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
-  1152755117U,	// <u,1,4,u>: Cost 2 vrev <1,u,u,4>
-  1481818214U,	// <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
-  2555560694U,	// <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
-  2555561576U,	// <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
-  1611490448U,	// <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  1481821494U,	// <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
-  2651025435U,	// <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
-  2651689068U,	// <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
-  2823966006U,	// <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
-  1611932861U,	// <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  2555568230U,	// <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
-  2689877199U,	// <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2712069336U,	// <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2685232353U,	// <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  2555571510U,	// <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
-  2689877235U,	// <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  2657661765U,	// <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
-  1584583574U,	// <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
-  1585247207U,	// <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
-  2561548390U,	// <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
-  2561549681U,	// <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
-  2573493926U,	// <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
-  2042962022U,	// <u,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2561551670U,	// <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
-  2226300309U,	// <u,1,7,5>: Cost 3 vrev <1,u,5,7>
-  2658325990U,	// <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
-  2658326124U,	// <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
-  2042962027U,	// <u,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1481842790U,	// <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
-  202162278U,	// <u,1,u,1>: Cost 1 vdup1 LHS
-  2685674867U,	// <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-  835584U,	// <u,1,u,3>: Cost 0 copy LHS
-  1481846070U,	// <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
-  1611933077U,	// <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685674910U,	// <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  1523652232U,	// <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
-  835584U,	// <u,1,u,u>: Cost 0 copy LHS
-  1544110154U,	// <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
-  1545437286U,	// <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
-  1545437420U,	// <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
-  2685232589U,	// <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2619179346U,	// <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
-  2712069606U,	// <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
-  2689877484U,	// <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2659656273U,	// <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
-  1545437853U,	// <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
-  1550082851U,	// <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
-  2619179828U,	// <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
-  2619179926U,	// <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
-  2685232671U,	// <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2555604278U,	// <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
-  2619180176U,	// <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
-  2689877564U,	// <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  2602718850U,	// <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  1158703235U,	// <u,2,1,u>: Cost 2 vrev <2,u,u,1>
-  1481867366U,	// <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
-  2555609846U,	// <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
-  269271142U,	// <u,2,2,2>: Cost 1 vdup2 LHS
-  1611490930U,	// <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  1481870646U,	// <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
-  2689877640U,	// <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2619180986U,	// <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
-  2593436837U,	// <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-  269271142U,	// <u,2,2,u>: Cost 1 vdup2 LHS
-  408134301U,	// <u,2,3,0>: Cost 1 vext1 LHS, LHS
-  1481876214U,	// <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1481877096U,	// <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1880326246U,	// <u,2,3,3>: Cost 2 vzipr LHS, LHS
-  408137014U,	// <u,2,3,4>: Cost 1 vext1 LHS, RHS
-  1529654992U,	// <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1529655802U,	// <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1529656314U,	// <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-  408139566U,	// <u,2,3,u>: Cost 1 vext1 LHS, LHS
-  1567853468U,	// <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  2561598362U,	// <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
-  2555627214U,	// <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
-  2685232918U,	// <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2555628854U,	// <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
-  1545440566U,	// <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1571982740U,	// <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
-  2592125957U,	// <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1545440809U,	// <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
-  2555633766U,	// <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
-  2561606550U,	// <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
-  2689877856U,	// <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685233000U,	// <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1158441059U,	// <u,2,5,4>: Cost 2 vrev <2,u,4,5>
-  2645725188U,	// <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
-  2689877892U,	// <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2823900470U,	// <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
-  1158736007U,	// <u,2,5,u>: Cost 2 vrev <2,u,u,5>
-  1481900134U,	// <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
-  2555642614U,	// <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
-  2555643496U,	// <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
-  1611491258U,	// <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  1481903414U,	// <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
-  2689877964U,	// <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689877973U,	// <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2645726030U,	// <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
-  1611933671U,	// <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  1585919033U,	// <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
-  2573566710U,	// <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
-  2567596115U,	// <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
-  1906901094U,	// <u,2,7,3>: Cost 2 vzipr RHS, LHS
-  2555653430U,	// <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
-  2800080230U,	// <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
-  2980643164U,	// <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2645726828U,	// <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
-  1906901099U,	// <u,2,7,u>: Cost 2 vzipr RHS, LHS
-  408175266U,	// <u,2,u,0>: Cost 1 vext1 LHS, LHS
-  1545443118U,	// <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
-  269271142U,	// <u,2,u,2>: Cost 1 vdup2 LHS
-  1611491416U,	// <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-  408177974U,	// <u,2,u,4>: Cost 1 vext1 LHS, RHS
-  1545443482U,	// <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1726339226U,	// <u,2,u,6>: Cost 2 vuzpl LHS, RHS
-  1529697274U,	// <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-  408180526U,	// <u,2,u,u>: Cost 1 vext1 LHS, LHS
-  1544781824U,	// <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-  471040156U,	// <u,3,0,1>: Cost 1 vext2 LHS, LHS
-  1544781988U,	// <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2618523900U,	// <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
-  1544782162U,	// <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2238188352U,	// <u,3,0,5>: Cost 3 vrev <3,u,5,0>
-  2623169023U,	// <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2238335826U,	// <u,3,0,7>: Cost 3 vrev <3,u,7,0>
-  471040669U,	// <u,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544782582U,	// <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544782644U,	// <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544782742U,	// <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544782808U,	// <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618524733U,	// <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544782992U,	// <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618524897U,	// <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2703517987U,	// <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
-  1544783213U,	// <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  1529716838U,	// <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
-  1164167966U,	// <u,3,2,1>: Cost 2 vrev <3,u,1,2>
-  1544783464U,	// <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544783526U,	// <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1529720118U,	// <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
-  2618525544U,	// <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544783802U,	// <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2704181620U,	// <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
-  1544783931U,	// <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1544784022U,	// <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  1487922559U,	// <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
-  1493895256U,	// <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
-  336380006U,	// <u,3,3,3>: Cost 1 vdup3 LHS
-  1544784386U,	// <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2824054478U,	// <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2238286668U,	// <u,3,3,6>: Cost 3 vrev <3,u,6,3>
-  2954069136U,	// <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-  336380006U,	// <u,3,3,u>: Cost 1 vdup3 LHS
-  1487929446U,	// <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
-  1487930752U,	// <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
-  2623171644U,	// <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2561673366U,	// <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
-  1487932726U,	// <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
-  471043382U,	// <u,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592561012U,	// <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2238368598U,	// <u,3,4,7>: Cost 3 vrev <3,u,7,4>
-  471043625U,	// <u,3,4,u>: Cost 1 vext2 LHS, RHS
-  2555707494U,	// <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
-  1574645465U,	// <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
-  2567653106U,	// <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
-  2555709954U,	// <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
-  1592561606U,	// <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592561668U,	// <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592561762U,	// <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1750314294U,	// <u,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1750314295U,	// <u,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2623172897U,	// <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2561688962U,	// <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
-  1581281795U,	// <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
-  2706541204U,	// <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
-  2623173261U,	// <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  1164495686U,	// <u,3,6,5>: Cost 2 vrev <3,u,5,6>
-  1592562488U,	// <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592562510U,	// <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1164716897U,	// <u,3,6,u>: Cost 2 vrev <3,u,u,6>
-  1487954022U,	// <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
-  1487955331U,	// <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
-  1493928028U,	// <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
-  2561697942U,	// <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
-  1487957302U,	// <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
-  2707352311U,	// <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
-  2655024623U,	// <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
-  1592563308U,	// <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1487959854U,	// <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
-  1544787667U,	// <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-  471045934U,	// <u,3,u,1>: Cost 1 vext2 LHS, LHS
-  1549432709U,	// <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-  336380006U,	// <u,3,u,3>: Cost 1 vdup3 LHS
-  1544788031U,	// <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-  471046298U,	// <u,3,u,5>: Cost 1 vext2 LHS, RHS
-  1549433040U,	// <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1750314537U,	// <u,3,u,7>: Cost 2 vuzpr LHS, RHS
-  471046501U,	// <u,3,u,u>: Cost 1 vext2 LHS, LHS
-  2625167360U,	// <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
-  1551425638U,	// <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  2619195630U,	// <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
-  2619343104U,	// <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2625167698U,	// <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
-  1638329234U,	// <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638329244U,	// <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787803556U,	// <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1551426205U,	// <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
-  2555748454U,	// <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
-  2625168180U,	// <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
-  1551426503U,	// <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
-  2625168344U,	// <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
-  2555751734U,	// <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
-  1860554038U,	// <u,4,1,5>: Cost 2 vzipl LHS, RHS
-  2689879022U,	// <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2592248852U,	// <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1555408301U,	// <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
-  2555756646U,	// <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
-  2625168943U,	// <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
-  2625169000U,	// <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
-  2619197134U,	// <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
-  2555759926U,	// <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
-  2712071222U,	// <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  1994771766U,	// <u,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,	// <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1994771784U,	// <u,4,2,u>: Cost 2 vtrnl LHS, RHS
-  2625169558U,	// <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
-  2567709594U,	// <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
-  2567710817U,	// <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
-  2625169820U,	// <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
-  2625169922U,	// <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
-  2954069710U,	// <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2954068172U,	// <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3903849472U,	// <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
-  2954068174U,	// <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  1505919078U,	// <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
-  2567717831U,	// <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
-  2567719010U,	// <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
-  2570373542U,	// <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-  161926454U,	// <u,4,4,4>: Cost 1 vdup0 RHS
-  1551428918U,	// <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
-  1638329572U,	// <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  2594927963U,	// <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-  161926454U,	// <u,4,4,u>: Cost 1 vdup0 RHS
-  1493983334U,	// <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
-  2689879301U,	// <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1493985379U,	// <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
-  2567727254U,	// <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
-  1493986614U,	// <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
-  1863535926U,	// <u,4,5,5>: Cost 2 vzipl RHS, RHS
-  537750838U,	// <u,4,5,6>: Cost 1 vext3 LHS, RHS
-  2830110006U,	// <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-  537750856U,	// <u,4,5,u>: Cost 1 vext3 LHS, RHS
-  1482047590U,	// <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
-  2555790070U,	// <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
-  2555790952U,	// <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
-  2555791510U,	// <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
-  1482050870U,	// <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
-  2689879422U,	// <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  1997753654U,	// <u,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2712071562U,	// <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1482053422U,	// <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
-  2567741542U,	// <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
-  2567742362U,	// <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
-  2567743589U,	// <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
-  2573716286U,	// <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
-  2567744822U,	// <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
-  2712071624U,	// <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-  96808489U,	// <u,4,7,6>: Cost 1 vrev RHS
-  2651715180U,	// <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
-  96955963U,	// <u,4,7,u>: Cost 1 vrev RHS
-  1482063974U,	// <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
-  1551431470U,	// <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  1494009958U,	// <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
-  2555807894U,	// <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
-  161926454U,	// <u,4,u,4>: Cost 1 vdup0 RHS
-  1551431834U,	// <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
-  537751081U,	// <u,4,u,6>: Cost 1 vext3 LHS, RHS
-  2830110249U,	// <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-  537751099U,	// <u,4,u,u>: Cost 1 vext3 LHS, RHS
-  2631811072U,	// <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
-  1558069350U,	// <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
-  2619203823U,	// <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
-  2619867456U,	// <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
-  1546273106U,	// <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2733010539U,	// <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2597622682U,	// <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
-  1176539396U,	// <u,5,0,7>: Cost 2 vrev <5,u,7,0>
-  1558069917U,	// <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
-  1505968230U,	// <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
-  2624512887U,	// <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
-  2631811990U,	// <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
-  2618541056U,	// <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
-  1505971510U,	// <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
-  2627167419U,	// <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
-  2579714554U,	// <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
-  1638330064U,	// <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638477529U,	// <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  2561802342U,	// <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
-  2561803264U,	// <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
-  2631149217U,	// <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
-  1558071026U,	// <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
-  2561805622U,	// <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
-  2714062607U,	// <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  2631813050U,	// <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
-  3092335926U,	// <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
-  1561389191U,	// <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
-  2561810534U,	// <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
-  2561811857U,	// <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
-  2631813474U,	// <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
-  2631813532U,	// <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
-  2619869698U,	// <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
-  3001847002U,	// <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
-  2954070530U,	// <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2018749750U,	// <u,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2018749751U,	// <u,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2573762662U,	// <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
-  2620017634U,	// <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  2573764338U,	// <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
-  2573765444U,	// <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
-  1570680053U,	// <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
-  1558072630U,	// <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
-  2645749143U,	// <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
-  1638330310U,	// <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1558072873U,	// <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
-  1506000998U,	// <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
-  2561827984U,	// <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
-  2579744360U,	// <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
-  2579744918U,	// <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
-  1506004278U,	// <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
-  229035318U,	// <u,5,5,5>: Cost 1 vdup1 RHS
-  2712072206U,	// <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638330392U,	// <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-  229035318U,	// <u,5,5,u>: Cost 1 vdup1 RHS
-  1500037222U,	// <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
-  2561836436U,	// <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
-  2567809133U,	// <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
-  1500040006U,	// <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
-  1500040502U,	// <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
-  2714062935U,	// <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712072288U,	// <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
-  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
-  1488101478U,	// <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488102805U,	// <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
-  2561844840U,	// <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
-  2561845398U,	// <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
-  1488104758U,	// <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
-  1638330536U,	// <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712072362U,	// <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2042965302U,	// <u,5,7,7>: Cost 2 vtrnr RHS, RHS
-  1488107310U,	// <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488109670U,	// <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
-  1488110998U,	// <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
-  2561853032U,	// <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
-  1500056392U,	// <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
-  1488112950U,	// <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
-  229035318U,	// <u,5,u,5>: Cost 1 vdup1 RHS
-  2954111490U,	// <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
-  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
-  2619211776U,	// <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
-  1545470054U,	// <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1545470192U,	// <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
-  2255958969U,	// <u,6,0,3>: Cost 3 vrev <6,u,3,0>
-  1546797458U,	// <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
-  2720624971U,	// <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
-  2256180180U,	// <u,6,0,6>: Cost 3 vrev <6,u,6,0>
-  2960682294U,	// <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
-  1545470621U,	// <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
-  1182004127U,	// <u,6,1,0>: Cost 2 vrev <6,u,0,1>
-  2619212596U,	// <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
-  2619212694U,	// <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
-  2619212760U,	// <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
-  2626511979U,	// <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
-  2619212944U,	// <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
-  2714063264U,	// <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2967326006U,	// <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
-  1182594023U,	// <u,6,1,u>: Cost 2 vrev <6,u,u,1>
-  1506050150U,	// <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
-  2579792630U,	// <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
-  2619213416U,	// <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
-  2619213478U,	// <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
-  1506053430U,	// <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
-  2633148309U,	// <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
-  2619213754U,	// <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
-  1638330874U,	// <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638478339U,	// <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2619213974U,	// <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
-  2255836074U,	// <u,6,3,1>: Cost 3 vrev <6,u,1,3>
-  2255909811U,	// <u,6,3,2>: Cost 3 vrev <6,u,2,3>
-  2619214236U,	// <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
-  1564715549U,	// <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
-  2639121006U,	// <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
-  3001847012U,	// <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1880329526U,	// <u,6,3,7>: Cost 2 vzipr LHS, RHS
-  1880329527U,	// <u,6,3,u>: Cost 2 vzipr LHS, RHS
-  2567864422U,	// <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
-  2733011558U,	// <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
-  2567866484U,	// <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
-  2638458005U,	// <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
-  1570540772U,	// <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1545473334U,	// <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
-  1572015512U,	// <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
-  2960715062U,	// <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
-  1545473577U,	// <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
-  2567872614U,	// <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
-  2645757648U,	// <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
-  2567874490U,	// <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
-  2576501250U,	// <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  1576660943U,	// <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
-  2645757956U,	// <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
-  2645758050U,	// <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
-  2824080694U,	// <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
-  1182626795U,	// <u,6,5,u>: Cost 2 vrev <6,u,u,5>
-  1506082918U,	// <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
-  2579825398U,	// <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
-  2645758458U,	// <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
-  2579826838U,	// <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
-  1506086198U,	// <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
-  2579828432U,	// <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
-  296144182U,	// <u,6,6,6>: Cost 1 vdup2 RHS
-  1638331202U,	// <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-  296144182U,	// <u,6,6,u>: Cost 1 vdup2 RHS
-  432349286U,	// <u,6,7,0>: Cost 1 vext1 RHS, LHS
-  1506091766U,	// <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1506092648U,	// <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506093206U,	// <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-  432352809U,	// <u,6,7,4>: Cost 1 vext1 RHS, RHS
-  1506094800U,	// <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-  1506095610U,	// <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1906904374U,	// <u,6,7,7>: Cost 2 vzipr RHS, RHS
-  432355118U,	// <u,6,7,u>: Cost 1 vext1 RHS, LHS
-  432357478U,	// <u,6,u,0>: Cost 1 vext1 RHS, LHS
-  1545475886U,	// <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1506100840U,	// <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506101398U,	// <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-  432361002U,	// <u,6,u,4>: Cost 1 vext1 RHS, RHS
-  1545476250U,	// <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
-  296144182U,	// <u,6,u,6>: Cost 1 vdup2 RHS
-  1880370486U,	// <u,6,u,7>: Cost 2 vzipr LHS, RHS
-  432363310U,	// <u,6,u,u>: Cost 1 vext1 RHS, LHS
-  1571356672U,	// <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-  497614950U,	// <u,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571356836U,	// <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2573880146U,	// <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
-  1571357010U,	// <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1512083716U,	// <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
-  2621874741U,	// <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
-  2585826298U,	// <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
-  497615517U,	// <u,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571357430U,	// <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571357492U,	// <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571357590U,	// <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1552114715U,	// <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
-  2573888822U,	// <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
-  1553441981U,	// <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
-  2627847438U,	// <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
-  2727408775U,	// <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
-  1555432880U,	// <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
-  2629838337U,	// <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
-  1188058754U,	// <u,7,2,1>: Cost 2 vrev <7,u,1,2>
-  1571358312U,	// <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571358374U,	// <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2632492869U,	// <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
-  2633156502U,	// <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
-  1560078311U,	// <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
-  2728072408U,	// <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
-  1561405577U,	// <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
-  1571358870U,	// <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2627184913U,	// <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
-  2633820523U,	// <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
-  1571359132U,	// <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571359234U,	// <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  1512108295U,	// <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
-  1518080992U,	// <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
-  2640456465U,	// <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
-  1571359518U,	// <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571359634U,	// <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2573911067U,	// <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
-  2645101622U,	// <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2573912918U,	// <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
-  1571359952U,	// <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-  497618248U,	// <u,7,4,5>: Cost 1 vext2 RHS, RHS
-  1571360116U,	// <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2645102024U,	// <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
-  497618473U,	// <u,7,4,u>: Cost 1 vext2 RHS, RHS
-  2645102152U,	// <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1571360464U,	// <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2645102334U,	// <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2645102447U,	// <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571360710U,	// <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571360772U,	// <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571360866U,	// <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1571360936U,	// <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1571361017U,	// <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  1530044518U,	// <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
-  2645103016U,	// <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571361274U,	// <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2645103154U,	// <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  1530047798U,	// <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
-  1188386474U,	// <u,7,6,5>: Cost 2 vrev <7,u,5,6>
-  1571361592U,	// <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571361614U,	// <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1571361695U,	// <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1571361786U,	// <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2573935616U,	// <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
-  2645103781U,	// <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
-  2573937497U,	// <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
-  1571362150U,	// <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  1512141067U,	// <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
-  1518113764U,	// <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
-  363253046U,	// <u,7,7,7>: Cost 1 vdup3 RHS
-  363253046U,	// <u,7,7,u>: Cost 1 vdup3 RHS
-  1571362515U,	// <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-  497620782U,	// <u,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571362693U,	// <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571362748U,	// <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571362879U,	// <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-  497621146U,	// <u,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571363024U,	// <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-  363253046U,	// <u,7,u,7>: Cost 1 vdup3 RHS
-  497621349U,	// <u,7,u,u>: Cost 1 vext2 RHS, LHS
-  135053414U,	// <u,u,0,0>: Cost 1 vdup0 LHS
-  471081121U,	// <u,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544822948U,	// <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1616140005U,	// <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  1544823122U,	// <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  1512157453U,	// <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
-  1662220032U,	// <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  1194457487U,	// <u,u,0,7>: Cost 2 vrev <u,u,7,0>
-  471081629U,	// <u,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544823542U,	// <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  202162278U,	// <u,u,1,1>: Cost 1 vdup1 LHS
-  537753390U,	// <u,u,1,2>: Cost 1 vext3 LHS, LHS
-  1544823768U,	// <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  1494248758U,	// <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
-  1544823952U,	// <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  1518138343U,	// <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
-  1640322907U,	// <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-  537753444U,	// <u,u,1,u>: Cost 1 vext3 LHS, LHS
-  1482309734U,	// <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
-  1194031451U,	// <u,u,2,1>: Cost 2 vrev <u,u,1,2>
-  269271142U,	// <u,u,2,2>: Cost 1 vdup2 LHS
-  835584U,	// <u,u,2,3>: Cost 0 copy LHS
-  1482313014U,	// <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
-  2618566504U,	// <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544824762U,	// <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  1638479788U,	// <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-  835584U,	// <u,u,2,u>: Cost 0 copy LHS
-  408576723U,	// <u,u,3,0>: Cost 1 vext1 LHS, LHS
-  1482318582U,	// <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  120371557U,	// <u,u,3,2>: Cost 1 vrev LHS
-  336380006U,	// <u,u,3,3>: Cost 1 vdup3 LHS
-  408579382U,	// <u,u,3,4>: Cost 1 vext1 LHS, RHS
-  1616140271U,	// <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  1530098170U,	// <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1880329544U,	// <u,u,3,7>: Cost 2 vzipr LHS, RHS
-  408581934U,	// <u,u,3,u>: Cost 1 vext1 LHS, LHS
-  1488298086U,	// <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
-  1488299437U,	// <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
-  1659271204U,	// <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  1194195311U,	// <u,u,4,3>: Cost 2 vrev <u,u,3,4>
-  161926454U,	// <u,u,4,4>: Cost 1 vdup0 RHS
-  471084342U,	// <u,u,4,5>: Cost 1 vext2 LHS, RHS
-  1571368308U,	// <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1640323153U,	// <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-  471084585U,	// <u,u,4,u>: Cost 1 vext2 LHS, RHS
-  1494278246U,	// <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
-  1571368656U,	// <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  1494280327U,	// <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
-  1616140415U,	// <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1494281526U,	// <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
-  229035318U,	// <u,u,5,5>: Cost 1 vdup1 RHS
-  537753754U,	// <u,u,5,6>: Cost 1 vext3 LHS, RHS
-  1750355254U,	// <u,u,5,7>: Cost 2 vuzpr LHS, RHS
-  537753772U,	// <u,u,5,u>: Cost 1 vext3 LHS, RHS
-  1482342502U,	// <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
-  2556084982U,	// <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
-  1571369466U,	// <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  1611938000U,	// <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1482345782U,	// <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
-  1194359171U,	// <u,u,6,5>: Cost 2 vrev <u,u,5,6>
-  296144182U,	// <u,u,6,6>: Cost 1 vdup2 RHS
-  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
-  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
-  432496742U,	// <u,u,7,0>: Cost 1 vext1 RHS, LHS
-  1488324016U,	// <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
-  1494296713U,	// <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
-  1906901148U,	// <u,u,7,3>: Cost 2 vzipr RHS, LHS
-  432500283U,	// <u,u,7,4>: Cost 1 vext1 RHS, RHS
-  1506242256U,	// <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-  120699277U,	// <u,u,7,6>: Cost 1 vrev RHS
-  363253046U,	// <u,u,7,7>: Cost 1 vdup3 RHS
-  432502574U,	// <u,u,7,u>: Cost 1 vext1 RHS, LHS
-  408617688U,	// <u,u,u,0>: Cost 1 vext1 LHS, LHS
-  471086894U,	// <u,u,u,1>: Cost 1 vext2 LHS, LHS
-  537753957U,	// <u,u,u,2>: Cost 1 vext3 LHS, LHS
-  835584U,	// <u,u,u,3>: Cost 0 copy LHS
-  408620342U,	// <u,u,u,4>: Cost 1 vext1 LHS, RHS
-  471087258U,	// <u,u,u,5>: Cost 1 vext2 LHS, RHS
-  537753997U,	// <u,u,u,6>: Cost 1 vext3 LHS, RHS
-  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
-  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+   135053414U,  // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U,  // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U,  // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U,  // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U,  // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U,  // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U,  // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U,  // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+   135053414U,  // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U,  // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U,  // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U,  // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U,  // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U,  // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U,  // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U,  // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U,  // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U,  // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U,  // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U,  // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U,  // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U,  // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U,  // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U,  // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U,  // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U,  // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U,  // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U,  // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U,  // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U,  // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U,  // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U,  // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U,  // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U,  // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U,  // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U,  // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U,  // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U,  // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U,  // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U,  // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U,  // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U,  // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U,  // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U,  // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U,  // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U,  // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U,  // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U,  // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U,  // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U,  // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U,  // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U,  // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U,  // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U,  // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U,  // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U,  // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U,  // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U,  // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U,  // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U,  // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U,  // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U,  // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U,  // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U,  // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U,  // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U,  // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U,  // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U,  // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U,  // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U,  // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U,  // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U,  // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+   135053414U,  // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U,  // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U,  // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U,  // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U,  // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U,  // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U,  // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U,  // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+   135053414U,  // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U,  // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U,  // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U,  // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U,  // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U,  // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U,  // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U,  // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U,  // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U,  // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U,  // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U,  // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U,  // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U,  // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U,  // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U,  // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U,  // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U,  // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U,  // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U,  // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U,  // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U,  // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+      835584U,  // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U,  // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U,  // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U,  // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U,  // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+      835584U,  // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U,  // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U,  // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U,  // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U,  // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U,  // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U,  // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U,  // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U,  // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U,  // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U,  // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U,  // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U,  // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U,  // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U,  // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U,  // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U,  // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U,  // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U,  // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U,  // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U,  // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U,  // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U,  // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U,  // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U,  // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U,  // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U,  // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U,  // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U,  // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U,  // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U,  // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U,  // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U,  // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U,  // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U,  // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U,  // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U,  // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U,  // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U,  // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U,  // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U,  // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U,  // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U,  // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U,  // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U,  // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U,  // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U,  // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U,  // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U,  // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+      835584U,  // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U,  // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U,  // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U,  // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U,  // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+      835584U,  // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U,  // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U,  // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U,  // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U,  // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U,  // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U,  // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U,  // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U,  // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U,  // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U,  // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U,  // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U,  // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U,  // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U,  // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U,  // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U,  // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U,  // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U,  // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U,  // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U,  // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U,  // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U,  // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U,  // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U,  // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U,  // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U,  // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U,  // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U,  // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U,  // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U,  // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U,  // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U,  // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U,  // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U,  // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U,  // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U,  // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U,  // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U,  // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U,  // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U,  // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U,  // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U,  // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U,  // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U,  // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U,  // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U,  // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U,  // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U,  // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U,  // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U,  // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U,  // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U,  // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U,  // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U,  // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U,  // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U,  // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U,  // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U,  // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U,  // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U,  // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U,  // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U,  // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U,  // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U,  // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U,  // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U,  // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U,  // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U,  // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U,  // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U,  // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U,  // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U,  // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U,  // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U,  // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U,  // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U,  // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U,  // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U,  // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U,  // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U,  // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U,  // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U,  // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U,  // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U,  // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U,  // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U,  // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U,  // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U,  // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U,  // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U,  // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U,  // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U,  // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U,  // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U,  // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U,  // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U,  // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U,  // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U,  // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U,  // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U,  // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U,  // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U,  // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U,  // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U,  // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U,  // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U,  // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U,  // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U,  // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U,  // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U,  // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U,  // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U,  // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U,  // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U,  // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U,  // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U,  // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U,  // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U,  // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U,  // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U,  // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U,  // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U,  // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U,  // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U,  // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U,  // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U,  // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U,  // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U,  // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U,  // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U,  // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U,  // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U,  // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U,  // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U,  // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U,  // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U,  // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U,  // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U,  // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U,  // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U,  // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U,  // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U,  // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U,  // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U,  // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U,  // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U,  // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U,  // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U,  // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U,  // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U,  // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U,  // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U,  // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U,  // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U,  // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U,  // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U,  // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U,  // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U,  // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U,  // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U,  // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U,  // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U,  // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U,  // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U,  // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U,  // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U,  // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U,  // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U,  // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U,  // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U,  // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U,  // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U,  // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U,  // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U,  // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U,  // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U,  // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U,  // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U,  // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U,  // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U,  // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U,  // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U,  // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U,  // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U,  // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U,  // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U,  // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U,  // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U,  // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U,  // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U,  // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U,  // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U,  // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U,  // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U,  // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U,  // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U,  // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U,  // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U,  // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U,  // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U,  // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U,  // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U,  // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U,  // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U,  // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U,  // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U,  // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U,  // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U,  // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U,  // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U,  // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U,  // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U,  // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U,  // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U,  // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U,  // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U,  // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U,  // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U,  // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U,  // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U,  // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U,  // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U,  // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U,  // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U,  // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U,  // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U,  // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U,  // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U,  // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U,  // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U,  // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U,  // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U,  // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U,  // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U,  // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U,  // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U,  // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U,  // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U,  // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U,  // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U,  // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U,  // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U,  // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U,  // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U,  // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U,  // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U,  // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U,  // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U,  // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U,  // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U,  // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U,  // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U,  // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U,  // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U,  // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U,  // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U,  // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U,  // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U,  // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U,  // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U,  // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U,  // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U,  // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U,  // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U,  // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U,  // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U,  // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U,  // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U,  // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U,  // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U,  // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U,  // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U,  // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U,  // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U,  // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U,  // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U,  // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U,  // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U,  // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U,  // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U,  // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U,  // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U,  // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U,  // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U,  // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U,  // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U,  // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U,  // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U,  // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U,  // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U,  // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U,  // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U,  // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U,  // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U,  // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U,  // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U,  // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U,  // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U,  // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U,  // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U,  // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U,  // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U,  // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U,  // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U,  // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U,  // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U,  // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U,  // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U,  // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U,  // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U,  // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U,  // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U,  // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U,  // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U,  // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U,  // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U,  // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U,  // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U,  // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U,  // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U,  // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U,  // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U,  // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U,  // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U,  // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U,  // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U,  // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U,  // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U,  // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U,  // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U,  // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U,  // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U,  // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U,  // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U,  // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U,  // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U,  // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U,  // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U,  // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U,  // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U,  // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U,  // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U,  // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U,  // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U,  // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U,  // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U,  // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U,  // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U,  // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U,  // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U,  // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U,  // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U,  // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U,  // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U,  // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U,  // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U,  // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U,  // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U,  // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U,  // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U,  // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U,  // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U,  // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U,  // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U,  // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U,  // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U,  // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U,  // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U,  // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U,  // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U,  // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U,  // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U,  // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U,  // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U,  // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U,  // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U,  // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U,  // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U,  // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U,  // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U,  // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U,  // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U,  // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U,  // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U,  // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U,  // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U,  // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U,  // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U,  // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U,  // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U,  // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U,  // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U,  // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U,  // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U,  // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U,  // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U,  // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U,  // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U,  // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U,  // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U,  // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U,  // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U,  // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U,  // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U,  // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U,  // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U,  // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U,  // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U,  // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U,  // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U,  // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U,  // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U,  // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U,  // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U,  // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U,  // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U,  // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U,  // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U,  // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U,  // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U,  // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U,  // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U,  // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U,  // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U,  // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U,  // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U,  // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U,  // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U,  // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U,  // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U,  // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U,  // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U,  // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U,  // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U,  // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U,  // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U,  // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U,  // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U,  // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U,  // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U,  // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U,  // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U,  // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U,  // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U,  // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U,  // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U,  // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U,  // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U,  // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U,  // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U,  // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U,  // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U,  // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U,  // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U,  // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U,  // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U,  // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U,  // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U,  // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U,  // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U,  // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U,  // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U,  // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U,  // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U,  // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U,  // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U,  // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U,  // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U,  // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U,  // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U,  // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U,  // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U,  // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U,  // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U,  // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U,  // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U,  // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U,  // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U,  // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U,  // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U,  // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U,  // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U,  // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U,  // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U,  // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U,  // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U,  // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+   135053414U,  // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U,  // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U,  // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U,  // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U,  // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U,  // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U,  // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U,  // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+   135053414U,  // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U,  // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U,  // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U,  // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U,  // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U,  // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U,  // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U,  // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U,  // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U,  // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U,  // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U,  // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U,  // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+      835584U,  // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U,  // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U,  // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U,  // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U,  // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+      835584U,  // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U,  // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U,  // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U,  // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U,  // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U,  // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U,  // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U,  // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U,  // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U,  // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U,  // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U,  // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U,  // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U,  // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U,  // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U,  // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U,  // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U,  // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U,  // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U,  // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U,  // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U,  // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U,  // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U,  // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U,  // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U,  // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U,  // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U,  // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U,  // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U,  // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U,  // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U,  // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U,  // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U,  // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U,  // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U,  // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U,  // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U,  // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U,  // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U,  // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U,  // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U,  // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U,  // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U,  // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U,  // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U,  // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+   135053414U,  // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U,  // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U,  // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+      835584U,  // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U,  // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U,  // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U,  // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U,  // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+      835584U,  // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U,  // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U,  // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U,  // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U,  // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U,  // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U,  // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U,  // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U,  // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U,  // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U,  // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U,  // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U,  // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U,  // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U,  // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U,  // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U,  // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U,  // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U,  // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U,  // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U,  // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U,  // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U,  // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U,  // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U,  // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U,  // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U,  // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U,  // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U,  // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U,  // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+    67944550U,  // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U,  // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U,  // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U,  // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U,  // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U,  // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+    68386972U,  // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U,  // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U,  // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U,  // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U,  // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U,  // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U,  // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U,  // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U,  // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U,  // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U,  // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U,  // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U,  // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U,  // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U,  // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U,  // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U,  // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U,  // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U,  // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U,  // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U,  // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U,  // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U,  // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U,  // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U,  // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U,  // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U,  // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U,  // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U,  // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U,  // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U,  // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U,  // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U,  // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U,  // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U,  // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U,  // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U,  // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U,  // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U,  // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+    67985515U,  // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U,  // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U,  // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U,  // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U,  // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U,  // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+    68427937U,  // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U,  // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U,  // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U,  // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U,  // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U,  // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U,  // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U,  // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U,  // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U,  // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U,  // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+   202162278U,  // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U,  // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U,  // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U,  // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U,  // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U,  // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U,  // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+   202162278U,  // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U,  // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U,  // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U,  // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U,  // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U,  // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U,  // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U,  // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U,  // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U,  // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U,  // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U,  // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U,  // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U,  // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U,  // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U,  // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U,  // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U,  // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U,  // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U,  // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U,  // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U,  // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U,  // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U,  // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U,  // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U,  // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U,  // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U,  // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U,  // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U,  // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U,  // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U,  // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U,  // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U,  // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U,  // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U,  // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U,  // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U,  // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U,  // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U,  // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U,  // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U,  // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U,  // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U,  // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U,  // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U,  // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U,  // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U,  // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U,  // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U,  // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U,  // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U,  // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U,  // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U,  // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U,  // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U,  // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+   202162278U,  // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U,  // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U,  // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U,  // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U,  // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U,  // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U,  // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+   202162278U,  // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U,  // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U,  // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U,  // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U,  // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U,  // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U,  // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U,  // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U,  // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U,  // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U,  // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U,  // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U,  // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U,  // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U,  // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U,  // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U,  // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U,  // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U,  // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U,  // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U,  // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U,  // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U,  // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U,  // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U,  // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U,  // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U,  // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U,  // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+   403488870U,  // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U,  // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U,  // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U,  // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+   403492150U,  // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U,  // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U,  // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U,  // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+   403494702U,  // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U,  // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U,  // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U,  // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U,  // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U,  // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U,  // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U,  // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U,  // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U,  // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U,  // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U,  // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U,  // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U,  // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U,  // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U,  // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U,  // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U,  // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U,  // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U,  // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U,  // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U,  // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U,  // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U,  // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U,  // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U,  // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U,  // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U,  // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U,  // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U,  // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U,  // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U,  // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U,  // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U,  // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U,  // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U,  // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U,  // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+   403529835U,  // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U,  // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U,  // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U,  // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+   403533110U,  // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U,  // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U,  // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U,  // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+   403535662U,  // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U,  // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U,  // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U,  // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U,  // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U,  // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U,  // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U,  // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U,  // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U,  // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U,  // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U,  // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U,  // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U,  // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U,  // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U,  // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U,  // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U,  // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U,  // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U,  // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U,  // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U,  // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U,  // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U,  // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U,  // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U,  // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U,  // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U,  // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U,  // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U,  // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U,  // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U,  // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U,  // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U,  // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U,  // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U,  // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U,  // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U,  // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U,  // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U,  // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U,  // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U,  // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U,  // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U,  // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U,  // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U,  // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U,  // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U,  // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U,  // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U,  // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U,  // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U,  // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U,  // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U,  // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U,  // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U,  // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U,  // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U,  // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U,  // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U,  // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U,  // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U,  // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U,  // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U,  // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U,  // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U,  // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U,  // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U,  // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U,  // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U,  // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U,  // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U,  // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U,  // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U,  // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U,  // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U,  // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U,  // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U,  // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U,  // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U,  // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U,  // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U,  // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U,  // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U,  // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U,  // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U,  // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U,  // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U,  // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U,  // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U,  // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U,  // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U,  // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U,  // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U,  // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U,  // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U,  // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U,  // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U,  // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U,  // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U,  // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U,  // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U,  // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U,  // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U,  // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U,  // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U,  // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U,  // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U,  // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U,  // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U,  // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U,  // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U,  // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U,  // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U,  // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U,  // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U,  // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U,  // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U,  // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U,  // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U,  // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U,  // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U,  // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U,  // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U,  // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U,  // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U,  // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U,  // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U,  // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U,  // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U,  // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U,  // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U,  // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U,  // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U,  // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U,  // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U,  // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U,  // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U,  // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U,  // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U,  // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U,  // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U,  // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U,  // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U,  // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U,  // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U,  // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U,  // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U,  // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U,  // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U,  // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U,  // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U,  // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U,  // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U,  // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U,  // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U,  // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U,  // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U,  // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U,  // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U,  // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U,  // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U,  // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U,  // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U,  // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U,  // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U,  // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U,  // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U,  // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U,  // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U,  // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U,  // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U,  // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U,  // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U,  // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U,  // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U,  // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U,  // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U,  // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U,  // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U,  // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U,  // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U,  // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U,  // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U,  // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U,  // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U,  // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U,  // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U,  // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U,  // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U,  // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U,  // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U,  // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U,  // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U,  // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U,  // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U,  // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U,  // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U,  // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U,  // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U,  // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U,  // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U,  // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U,  // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U,  // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U,  // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U,  // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U,  // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U,  // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U,  // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U,  // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U,  // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U,  // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U,  // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U,  // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U,  // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U,  // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U,  // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U,  // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U,  // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U,  // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U,  // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U,  // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U,  // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U,  // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U,  // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U,  // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U,  // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U,  // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U,  // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U,  // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U,  // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U,  // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U,  // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U,  // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U,  // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U,  // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U,  // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U,  // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U,  // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U,  // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U,  // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U,  // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U,  // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U,  // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U,  // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U,  // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U,  // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U,  // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U,  // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U,  // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U,  // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U,  // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U,  // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U,  // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U,  // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U,  // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U,  // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U,  // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U,  // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U,  // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U,  // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U,  // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U,  // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U,  // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U,  // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U,  // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U,  // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U,  // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U,  // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U,  // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U,  // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U,  // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U,  // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U,  // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U,  // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U,  // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U,  // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U,  // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U,  // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U,  // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U,  // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U,  // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U,  // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U,  // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U,  // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U,  // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U,  // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U,  // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U,  // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U,  // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U,  // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U,  // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U,  // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U,  // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U,  // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U,  // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U,  // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U,  // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U,  // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U,  // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U,  // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U,  // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U,  // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U,  // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U,  // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U,  // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U,  // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U,  // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U,  // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U,  // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U,  // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U,  // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U,  // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U,  // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U,  // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U,  // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U,  // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U,  // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U,  // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U,  // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U,  // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U,  // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U,  // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U,  // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U,  // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U,  // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U,  // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U,  // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U,  // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U,  // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U,  // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U,  // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U,  // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U,  // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U,  // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U,  // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U,  // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U,  // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U,  // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U,  // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U,  // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U,  // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U,  // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U,  // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U,  // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U,  // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U,  // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U,  // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U,  // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U,  // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U,  // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U,  // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U,  // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U,  // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U,  // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U,  // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U,  // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U,  // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U,  // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U,  // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U,  // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U,  // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U,  // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U,  // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U,  // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U,  // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U,  // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U,  // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U,  // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U,  // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U,  // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U,  // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U,  // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U,  // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U,  // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U,  // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U,  // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U,  // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U,  // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U,  // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U,  // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U,  // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U,  // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U,  // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U,  // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U,  // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U,  // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U,  // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U,  // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U,  // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U,  // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U,  // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U,  // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U,  // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U,  // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U,  // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U,  // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U,  // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U,  // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U,  // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U,  // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U,  // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U,  // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U,  // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U,  // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U,  // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U,  // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U,  // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U,  // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U,  // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U,  // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U,  // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U,  // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U,  // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U,  // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U,  // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+   202162278U,  // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U,  // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U,  // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U,  // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U,  // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U,  // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U,  // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+   202162278U,  // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U,  // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U,  // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U,  // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U,  // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U,  // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U,  // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U,  // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U,  // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U,  // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+   403931292U,  // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U,  // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+   115726126U,  // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U,  // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+   403934518U,  // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U,  // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U,  // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U,  // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+   403937070U,  // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U,  // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U,  // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U,  // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U,  // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U,  // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U,  // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U,  // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U,  // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U,  // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U,  // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U,  // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U,  // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U,  // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U,  // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U,  // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U,  // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U,  // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U,  // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U,  // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U,  // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U,  // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U,  // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U,  // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U,  // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U,  // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U,  // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U,  // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U,  // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U,  // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U,  // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U,  // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U,  // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U,  // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U,  // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U,  // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U,  // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+   403972257U,  // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+   202162278U,  // <1,u,u,1>: Cost 1 vdup1 LHS
+   115767091U,  // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U,  // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+   403975478U,  // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U,  // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U,  // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U,  // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+   403978030U,  // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U,  // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U,  // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U,  // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U,  // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U,  // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U,  // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U,  // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U,  // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U,  // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U,  // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U,  // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U,  // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U,  // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U,  // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U,  // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U,  // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U,  // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U,  // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U,  // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U,  // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U,  // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U,  // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U,  // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U,  // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U,  // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U,  // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U,  // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U,  // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U,  // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U,  // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U,  // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U,  // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U,  // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U,  // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U,  // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U,  // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U,  // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U,  // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U,  // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U,  // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U,  // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U,  // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U,  // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U,  // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U,  // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U,  // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U,  // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U,  // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U,  // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U,  // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U,  // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U,  // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U,  // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U,  // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U,  // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U,  // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U,  // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U,  // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U,  // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U,  // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U,  // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U,  // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U,  // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U,  // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U,  // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U,  // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U,  // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U,  // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U,  // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U,  // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U,  // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U,  // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U,  // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U,  // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U,  // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U,  // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U,  // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U,  // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U,  // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U,  // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U,  // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U,  // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U,  // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U,  // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U,  // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U,  // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U,  // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U,  // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U,  // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U,  // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U,  // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U,  // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U,  // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U,  // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U,  // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U,  // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U,  // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U,  // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U,  // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U,  // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U,  // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U,  // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U,  // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U,  // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U,  // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U,  // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U,  // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U,  // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U,  // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U,  // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U,  // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U,  // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U,  // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U,  // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U,  // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U,  // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U,  // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U,  // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U,  // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U,  // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U,  // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U,  // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U,  // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U,  // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U,  // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U,  // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U,  // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U,  // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U,  // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U,  // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U,  // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U,  // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U,  // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U,  // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U,  // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U,  // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U,  // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U,  // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U,  // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U,  // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U,  // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U,  // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U,  // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U,  // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U,  // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U,  // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U,  // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U,  // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U,  // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U,  // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U,  // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U,  // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U,  // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U,  // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U,  // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U,  // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U,  // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U,  // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U,  // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U,  // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U,  // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U,  // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U,  // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U,  // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U,  // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U,  // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U,  // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U,  // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U,  // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U,  // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U,  // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U,  // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U,  // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U,  // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U,  // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U,  // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U,  // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U,  // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U,  // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U,  // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U,  // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U,  // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+   269271142U,  // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U,  // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U,  // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U,  // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U,  // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U,  // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+   269271142U,  // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U,  // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U,  // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U,  // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U,  // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U,  // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U,  // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U,  // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U,  // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U,  // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U,  // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U,  // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U,  // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U,  // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U,  // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U,  // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U,  // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U,  // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U,  // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U,  // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U,  // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U,  // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U,  // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U,  // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U,  // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U,  // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U,  // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U,  // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U,  // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U,  // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U,  // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U,  // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U,  // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U,  // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U,  // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U,  // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U,  // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U,  // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U,  // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U,  // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U,  // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U,  // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U,  // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U,  // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U,  // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U,  // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U,  // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U,  // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+   269271142U,  // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U,  // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U,  // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U,  // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U,  // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U,  // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+   269271142U,  // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U,  // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+   470597734U,  // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U,  // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U,  // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U,  // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U,  // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U,  // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U,  // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+   470598301U,  // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U,  // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U,  // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U,  // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U,  // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U,  // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U,  // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U,  // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U,  // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U,  // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U,  // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U,  // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U,  // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U,  // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U,  // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U,  // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U,  // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U,  // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U,  // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U,  // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U,  // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U,  // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U,  // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U,  // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U,  // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U,  // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U,  // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U,  // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U,  // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U,  // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U,  // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U,  // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U,  // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+   470601014U,  // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U,  // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U,  // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+   470601257U,  // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U,  // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U,  // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U,  // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U,  // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U,  // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U,  // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U,  // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U,  // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U,  // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U,  // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U,  // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U,  // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U,  // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U,  // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U,  // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U,  // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U,  // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U,  // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U,  // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U,  // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U,  // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U,  // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U,  // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U,  // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U,  // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U,  // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U,  // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U,  // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+   470603566U,  // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U,  // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U,  // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U,  // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+   470603930U,  // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U,  // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U,  // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+   470604133U,  // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U,  // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U,  // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U,  // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U,  // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U,  // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U,  // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U,  // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U,  // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U,  // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U,  // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U,  // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U,  // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U,  // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U,  // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U,  // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U,  // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U,  // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U,  // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U,  // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U,  // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U,  // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U,  // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U,  // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U,  // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U,  // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U,  // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U,  // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U,  // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U,  // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U,  // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U,  // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U,  // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U,  // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U,  // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U,  // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U,  // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U,  // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U,  // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U,  // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U,  // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U,  // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U,  // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U,  // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U,  // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U,  // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U,  // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U,  // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U,  // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U,  // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U,  // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U,  // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U,  // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U,  // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U,  // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U,  // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U,  // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U,  // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U,  // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U,  // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U,  // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U,  // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U,  // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U,  // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U,  // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U,  // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U,  // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U,  // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U,  // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U,  // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U,  // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U,  // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U,  // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U,  // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U,  // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U,  // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U,  // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U,  // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U,  // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U,  // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U,  // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U,  // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U,  // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U,  // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U,  // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U,  // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U,  // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U,  // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U,  // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U,  // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U,  // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U,  // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U,  // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U,  // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U,  // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U,  // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U,  // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U,  // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U,  // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U,  // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U,  // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U,  // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U,  // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U,  // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U,  // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U,  // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U,  // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U,  // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U,  // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U,  // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U,  // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U,  // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U,  // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U,  // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U,  // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U,  // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U,  // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U,  // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U,  // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U,  // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U,  // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U,  // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U,  // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U,  // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U,  // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U,  // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U,  // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U,  // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U,  // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U,  // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U,  // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U,  // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U,  // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U,  // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U,  // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U,  // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U,  // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U,  // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U,  // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U,  // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U,  // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U,  // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U,  // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U,  // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U,  // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U,  // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U,  // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U,  // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U,  // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U,  // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U,  // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U,  // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U,  // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U,  // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U,  // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U,  // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U,  // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U,  // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U,  // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U,  // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U,  // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U,  // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U,  // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U,  // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U,  // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U,  // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U,  // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U,  // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U,  // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U,  // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U,  // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U,  // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U,  // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U,  // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U,  // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U,  // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U,  // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U,  // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U,  // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U,  // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U,  // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U,  // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U,  // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U,  // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U,  // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U,  // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U,  // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U,  // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U,  // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U,  // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U,  // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U,  // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U,  // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U,  // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U,  // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U,  // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U,  // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U,  // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U,  // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U,  // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U,  // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U,  // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U,  // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U,  // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U,  // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U,  // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U,  // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U,  // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U,  // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U,  // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U,  // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U,  // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U,  // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U,  // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U,  // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U,  // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U,  // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U,  // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U,  // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U,  // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U,  // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U,  // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U,  // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U,  // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U,  // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U,  // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U,  // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U,  // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U,  // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U,  // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U,  // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U,  // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U,  // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U,  // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U,  // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U,  // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U,  // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U,  // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U,  // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U,  // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U,  // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U,  // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U,  // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U,  // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U,  // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U,  // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U,  // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U,  // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U,  // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U,  // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U,  // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U,  // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U,  // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U,  // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U,  // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U,  // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U,  // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U,  // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U,  // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U,  // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U,  // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U,  // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U,  // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U,  // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U,  // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U,  // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U,  // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U,  // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U,  // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U,  // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U,  // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U,  // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U,  // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U,  // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U,  // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U,  // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U,  // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U,  // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U,  // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U,  // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U,  // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U,  // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U,  // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U,  // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U,  // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U,  // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U,  // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U,  // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U,  // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U,  // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U,  // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U,  // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U,  // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U,  // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U,  // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U,  // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U,  // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U,  // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U,  // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U,  // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U,  // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U,  // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U,  // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U,  // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U,  // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U,  // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U,  // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U,  // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U,  // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U,  // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U,  // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U,  // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U,  // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U,  // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U,  // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U,  // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U,  // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U,  // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U,  // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U,  // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U,  // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U,  // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U,  // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U,  // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U,  // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U,  // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+   470638699U,  // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U,  // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U,  // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U,  // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U,  // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U,  // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U,  // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+   470639261U,  // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U,  // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U,  // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U,  // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U,  // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U,  // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U,  // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U,  // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U,  // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U,  // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U,  // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U,  // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+   269271142U,  // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U,  // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U,  // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U,  // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U,  // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U,  // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+   269271142U,  // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U,  // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U,  // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U,  // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U,  // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U,  // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U,  // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U,  // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U,  // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U,  // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U,  // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U,  // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U,  // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U,  // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U,  // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+   470641974U,  // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U,  // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U,  // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+   470642217U,  // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U,  // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U,  // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U,  // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U,  // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U,  // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U,  // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U,  // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U,  // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U,  // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U,  // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U,  // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U,  // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U,  // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U,  // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U,  // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U,  // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U,  // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U,  // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U,  // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U,  // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U,  // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U,  // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U,  // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U,  // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U,  // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U,  // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U,  // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U,  // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+   470644526U,  // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+   269271142U,  // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U,  // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U,  // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+   470644890U,  // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U,  // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U,  // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+   470645093U,  // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U,  // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U,  // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U,  // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U,  // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U,  // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U,  // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U,  // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U,  // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U,  // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U,  // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U,  // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+   537706598U,  // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U,  // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U,  // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U,  // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U,  // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U,  // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+   537706652U,  // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U,  // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U,  // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U,  // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U,  // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U,  // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U,  // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U,  // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U,  // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U,  // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U,  // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U,  // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U,  // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U,  // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U,  // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U,  // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U,  // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U,  // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U,  // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U,  // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U,  // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U,  // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U,  // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U,  // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U,  // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U,  // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U,  // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U,  // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U,  // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U,  // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U,  // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U,  // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U,  // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U,  // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U,  // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U,  // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U,  // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U,  // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U,  // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U,  // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U,  // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U,  // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U,  // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U,  // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U,  // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U,  // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U,  // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U,  // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U,  // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U,  // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U,  // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U,  // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U,  // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U,  // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U,  // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U,  // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U,  // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+   537707165U,  // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U,  // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U,  // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U,  // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U,  // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U,  // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+   537707219U,  // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U,  // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U,  // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U,  // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U,  // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U,  // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U,  // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U,  // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U,  // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U,  // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U,  // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U,  // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U,  // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U,  // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U,  // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U,  // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U,  // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U,  // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U,  // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U,  // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U,  // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U,  // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U,  // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U,  // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U,  // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U,  // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U,  // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U,  // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U,  // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U,  // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U,  // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U,  // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U,  // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U,  // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U,  // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U,  // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U,  // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U,  // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U,  // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U,  // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U,  // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U,  // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U,  // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U,  // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U,  // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U,  // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U,  // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U,  // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U,  // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U,  // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U,  // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U,  // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U,  // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U,  // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U,  // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U,  // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U,  // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U,  // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U,  // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U,  // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U,  // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U,  // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U,  // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U,  // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U,  // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U,  // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U,  // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U,  // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U,  // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U,  // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U,  // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U,  // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U,  // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U,  // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U,  // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U,  // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U,  // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U,  // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U,  // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U,  // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U,  // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U,  // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U,  // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U,  // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U,  // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U,  // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U,  // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U,  // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U,  // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U,  // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U,  // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U,  // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U,  // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U,  // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U,  // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U,  // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U,  // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U,  // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U,  // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U,  // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U,  // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U,  // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U,  // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U,  // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U,  // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U,  // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U,  // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U,  // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U,  // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U,  // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U,  // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U,  // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U,  // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U,  // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U,  // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U,  // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U,  // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U,  // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U,  // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U,  // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U,  // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U,  // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U,  // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U,  // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U,  // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U,  // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U,  // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U,  // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U,  // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U,  // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U,  // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U,  // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U,  // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U,  // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U,  // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U,  // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U,  // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U,  // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U,  // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U,  // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U,  // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U,  // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U,  // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U,  // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U,  // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U,  // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U,  // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U,  // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U,  // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U,  // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U,  // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U,  // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U,  // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U,  // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U,  // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U,  // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U,  // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U,  // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U,  // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U,  // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U,  // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U,  // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U,  // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U,  // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U,  // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U,  // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U,  // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U,  // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U,  // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U,  // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U,  // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U,  // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U,  // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U,  // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U,  // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U,  // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U,  // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U,  // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U,  // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U,  // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U,  // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U,  // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U,  // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U,  // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U,  // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U,  // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U,  // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U,  // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U,  // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U,  // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U,  // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U,  // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U,  // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+   336380006U,  // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U,  // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U,  // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U,  // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U,  // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+   336380006U,  // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U,  // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U,  // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U,  // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U,  // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U,  // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U,  // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U,  // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U,  // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U,  // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U,  // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U,  // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U,  // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U,  // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U,  // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U,  // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U,  // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U,  // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U,  // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U,  // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U,  // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U,  // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U,  // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U,  // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U,  // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U,  // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U,  // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U,  // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U,  // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U,  // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U,  // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U,  // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U,  // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U,  // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U,  // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U,  // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U,  // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U,  // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U,  // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U,  // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+   336380006U,  // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U,  // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U,  // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U,  // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U,  // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+   336380006U,  // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U,  // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U,  // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U,  // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U,  // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U,  // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U,  // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U,  // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U,  // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U,  // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U,  // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U,  // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U,  // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U,  // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U,  // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U,  // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U,  // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U,  // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U,  // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U,  // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U,  // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U,  // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U,  // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U,  // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U,  // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U,  // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U,  // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U,  // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U,  // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U,  // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U,  // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U,  // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U,  // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U,  // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U,  // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U,  // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U,  // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U,  // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U,  // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U,  // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U,  // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U,  // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U,  // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U,  // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U,  // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U,  // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U,  // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U,  // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U,  // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U,  // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U,  // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U,  // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+   537709878U,  // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U,  // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+   537709896U,  // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U,  // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U,  // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U,  // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U,  // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U,  // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U,  // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U,  // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U,  // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U,  // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U,  // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U,  // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U,  // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U,  // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U,  // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U,  // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U,  // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U,  // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U,  // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U,  // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U,  // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U,  // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U,  // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U,  // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U,  // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+   537710121U,  // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U,  // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+   537710139U,  // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U,  // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U,  // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U,  // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U,  // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U,  // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U,  // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U,  // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U,  // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U,  // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U,  // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U,  // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U,  // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U,  // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U,  // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U,  // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U,  // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U,  // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U,  // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U,  // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U,  // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U,  // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U,  // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U,  // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U,  // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U,  // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U,  // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U,  // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U,  // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U,  // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U,  // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U,  // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U,  // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U,  // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U,  // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U,  // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U,  // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U,  // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U,  // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U,  // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U,  // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U,  // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U,  // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U,  // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U,  // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U,  // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U,  // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U,  // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U,  // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U,  // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U,  // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U,  // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U,  // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U,  // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U,  // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U,  // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U,  // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U,  // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U,  // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U,  // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U,  // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U,  // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U,  // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U,  // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U,  // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U,  // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U,  // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U,  // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U,  // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U,  // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U,  // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U,  // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U,  // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U,  // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U,  // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U,  // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U,  // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U,  // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U,  // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U,  // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U,  // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U,  // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U,  // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U,  // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U,  // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U,  // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U,  // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U,  // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U,  // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U,  // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U,  // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U,  // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U,  // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U,  // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U,  // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U,  // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U,  // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U,  // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U,  // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U,  // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U,  // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U,  // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U,  // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U,  // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U,  // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U,  // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U,  // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U,  // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U,  // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U,  // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U,  // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U,  // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U,  // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U,  // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U,  // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U,  // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U,  // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U,  // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U,  // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U,  // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U,  // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U,  // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U,  // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U,  // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U,  // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U,  // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U,  // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U,  // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U,  // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U,  // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U,  // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U,  // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U,  // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U,  // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U,  // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U,  // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U,  // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U,  // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U,  // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U,  // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U,  // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U,  // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U,  // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U,  // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U,  // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U,  // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U,  // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U,  // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U,  // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U,  // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U,  // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U,  // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U,  // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U,  // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U,  // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U,  // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U,  // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U,  // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U,  // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U,  // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U,  // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U,  // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U,  // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U,  // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U,  // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U,  // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U,  // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U,  // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U,  // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U,  // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U,  // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U,  // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U,  // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U,  // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U,  // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U,  // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U,  // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U,  // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U,  // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U,  // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U,  // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U,  // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U,  // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U,  // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U,  // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U,  // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U,  // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U,  // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U,  // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U,  // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U,  // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U,  // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U,  // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U,  // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U,  // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U,  // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U,  // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U,  // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U,  // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U,  // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U,  // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U,  // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U,  // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U,  // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U,  // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U,  // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U,  // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U,  // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U,  // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U,  // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U,  // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U,  // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U,  // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U,  // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U,  // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U,  // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U,  // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U,  // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U,  // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U,  // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U,  // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U,  // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U,  // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U,  // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U,  // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U,  // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U,  // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U,  // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U,  // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U,  // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U,  // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U,  // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U,  // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U,  // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U,  // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U,  // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U,  // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U,  // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U,  // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U,  // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U,  // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U,  // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U,  // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U,  // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U,  // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U,  // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U,  // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U,  // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U,  // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U,  // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U,  // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U,  // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U,  // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U,  // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U,  // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+   537712430U,  // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U,  // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U,  // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U,  // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U,  // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U,  // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+   537712484U,  // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U,  // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U,  // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U,  // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U,  // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U,  // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U,  // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U,  // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U,  // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U,  // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U,  // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U,  // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U,  // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+   336380006U,  // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U,  // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U,  // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U,  // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U,  // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+   336380006U,  // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U,  // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U,  // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U,  // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U,  // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U,  // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U,  // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U,  // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U,  // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U,  // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U,  // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U,  // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U,  // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U,  // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U,  // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U,  // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+   537712794U,  // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U,  // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+   537712812U,  // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U,  // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U,  // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U,  // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U,  // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U,  // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U,  // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U,  // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U,  // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U,  // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U,  // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U,  // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U,  // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U,  // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U,  // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U,  // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U,  // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U,  // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U,  // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U,  // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U,  // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+   537712997U,  // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+   336380006U,  // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U,  // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U,  // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+   537713037U,  // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U,  // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+   537713051U,  // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U,  // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U,  // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U,  // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U,  // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U,  // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U,  // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U,  // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U,  // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U,  // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U,  // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U,  // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U,  // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U,  // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U,  // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U,  // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U,  // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U,  // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U,  // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U,  // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U,  // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U,  // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U,  // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U,  // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U,  // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U,  // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U,  // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U,  // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U,  // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U,  // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U,  // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U,  // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U,  // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U,  // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U,  // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U,  // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U,  // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U,  // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U,  // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U,  // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U,  // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U,  // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U,  // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U,  // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U,  // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U,  // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U,  // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U,  // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U,  // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U,  // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U,  // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U,  // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U,  // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U,  // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U,  // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U,  // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U,  // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U,  // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U,  // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U,  // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U,  // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U,  // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U,  // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U,  // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U,  // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U,  // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U,  // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U,  // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U,  // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U,  // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U,  // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U,  // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U,  // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U,  // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U,  // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U,  // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U,  // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U,  // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U,  // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U,  // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U,  // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U,  // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U,  // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U,  // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U,  // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U,  // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U,  // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U,  // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U,  // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U,  // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U,  // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U,  // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U,  // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U,  // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U,  // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U,  // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U,  // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U,  // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U,  // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U,  // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U,  // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U,  // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U,  // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U,  // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U,  // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U,  // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U,  // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U,  // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U,  // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U,  // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U,  // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U,  // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U,  // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U,  // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U,  // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U,  // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U,  // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U,  // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U,  // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U,  // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U,  // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U,  // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U,  // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U,  // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U,  // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U,  // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U,  // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U,  // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U,  // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U,  // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U,  // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U,  // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U,  // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U,  // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U,  // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U,  // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U,  // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U,  // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U,  // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U,  // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U,  // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U,  // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U,  // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U,  // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U,  // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U,  // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U,  // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U,  // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U,  // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U,  // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U,  // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U,  // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U,  // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U,  // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U,  // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U,  // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U,  // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U,  // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U,  // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U,  // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U,  // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U,  // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U,  // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U,  // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U,  // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U,  // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U,  // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U,  // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U,  // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U,  // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U,  // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U,  // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U,  // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U,  // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U,  // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U,  // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U,  // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U,  // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U,  // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U,  // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U,  // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U,  // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U,  // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U,  // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U,  // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U,  // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U,  // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U,  // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U,  // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U,  // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U,  // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U,  // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U,  // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U,  // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U,  // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U,  // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U,  // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U,  // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U,  // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U,  // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U,  // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U,  // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U,  // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U,  // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U,  // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U,  // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U,  // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U,  // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U,  // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U,  // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U,  // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U,  // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U,  // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U,  // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U,  // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U,  // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U,  // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U,  // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U,  // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U,  // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U,  // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U,  // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U,  // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U,  // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U,  // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U,  // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U,  // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U,  // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U,  // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U,  // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U,  // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U,  // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U,  // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U,  // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U,  // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U,  // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U,  // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U,  // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U,  // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U,  // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U,  // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U,  // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U,  // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U,  // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U,  // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U,  // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U,  // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U,  // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U,  // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U,  // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U,  // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U,  // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U,  // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U,  // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U,  // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U,  // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U,  // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U,  // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U,  // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U,  // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U,  // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U,  // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U,  // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U,  // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U,  // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U,  // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U,  // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U,  // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U,  // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U,  // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U,  // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U,  // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U,  // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U,  // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U,  // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U,  // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U,  // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U,  // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U,  // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U,  // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U,  // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U,  // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U,  // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U,  // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U,  // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U,  // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U,  // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U,  // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U,  // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U,  // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U,  // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U,  // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U,  // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U,  // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U,  // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U,  // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U,  // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U,  // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U,  // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U,  // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U,  // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U,  // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U,  // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U,  // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U,  // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U,  // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U,  // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U,  // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U,  // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U,  // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U,  // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U,  // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U,  // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U,  // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U,  // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U,  // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U,  // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U,  // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U,  // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U,  // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U,  // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U,  // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U,  // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U,  // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U,  // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U,  // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U,  // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U,  // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U,  // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U,  // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U,  // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U,  // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U,  // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U,  // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U,  // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U,  // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U,  // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U,  // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U,  // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U,  // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U,  // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U,  // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U,  // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U,  // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U,  // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U,  // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U,  // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U,  // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U,  // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U,  // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U,  // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U,  // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U,  // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U,  // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U,  // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U,  // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U,  // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U,  // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U,  // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U,  // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U,  // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U,  // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U,  // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U,  // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U,  // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+   161926454U,  // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U,  // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U,  // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U,  // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+   161926454U,  // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U,  // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U,  // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U,  // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U,  // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U,  // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U,  // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U,  // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U,  // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U,  // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U,  // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U,  // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U,  // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U,  // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U,  // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U,  // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U,  // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U,  // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U,  // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U,  // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U,  // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U,  // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U,  // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U,  // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U,  // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U,  // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U,  // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U,  // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U,  // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U,  // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U,  // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U,  // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+   161926454U,  // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U,  // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U,  // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U,  // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+   161926454U,  // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U,  // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U,  // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U,  // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U,  // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U,  // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U,  // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U,  // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U,  // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U,  // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U,  // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U,  // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U,  // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U,  // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U,  // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U,  // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U,  // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U,  // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U,  // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U,  // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U,  // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U,  // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U,  // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U,  // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U,  // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U,  // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U,  // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U,  // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U,  // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U,  // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U,  // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U,  // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U,  // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U,  // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U,  // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U,  // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U,  // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U,  // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U,  // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U,  // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U,  // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U,  // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U,  // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U,  // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U,  // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U,  // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U,  // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U,  // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U,  // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U,  // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U,  // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U,  // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U,  // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U,  // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U,  // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U,  // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U,  // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U,  // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U,  // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U,  // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U,  // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U,  // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+    27705344U,  // <4,5,6,7>: Cost 0 copy RHS
+    27705344U,  // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U,  // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U,  // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U,  // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U,  // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U,  // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U,  // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U,  // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U,  // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U,  // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U,  // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U,  // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U,  // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U,  // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U,  // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U,  // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U,  // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+    27705344U,  // <4,5,u,7>: Cost 0 copy RHS
+    27705344U,  // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U,  // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U,  // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U,  // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U,  // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U,  // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U,  // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U,  // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U,  // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U,  // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U,  // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U,  // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U,  // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U,  // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U,  // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U,  // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U,  // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U,  // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U,  // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U,  // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U,  // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U,  // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U,  // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U,  // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U,  // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U,  // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U,  // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U,  // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U,  // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U,  // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U,  // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U,  // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U,  // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U,  // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U,  // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U,  // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U,  // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U,  // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U,  // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U,  // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U,  // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U,  // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U,  // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U,  // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U,  // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U,  // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U,  // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U,  // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U,  // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U,  // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U,  // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U,  // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U,  // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U,  // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U,  // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U,  // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U,  // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U,  // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U,  // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U,  // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U,  // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U,  // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U,  // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U,  // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U,  // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U,  // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U,  // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U,  // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U,  // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U,  // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U,  // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U,  // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U,  // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U,  // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U,  // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U,  // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U,  // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U,  // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U,  // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U,  // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U,  // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U,  // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U,  // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U,  // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U,  // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U,  // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U,  // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U,  // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U,  // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U,  // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U,  // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U,  // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U,  // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U,  // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U,  // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U,  // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U,  // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U,  // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U,  // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U,  // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U,  // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U,  // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U,  // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U,  // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U,  // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U,  // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U,  // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U,  // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U,  // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U,  // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U,  // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U,  // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U,  // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U,  // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U,  // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U,  // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U,  // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U,  // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U,  // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U,  // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U,  // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U,  // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U,  // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U,  // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U,  // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U,  // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U,  // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U,  // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U,  // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U,  // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U,  // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U,  // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U,  // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U,  // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U,  // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U,  // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U,  // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U,  // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U,  // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U,  // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U,  // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U,  // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U,  // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U,  // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U,  // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U,  // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U,  // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U,  // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U,  // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U,  // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U,  // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U,  // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U,  // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U,  // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U,  // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U,  // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U,  // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U,  // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U,  // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U,  // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U,  // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U,  // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U,  // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U,  // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U,  // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U,  // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U,  // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U,  // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U,  // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U,  // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U,  // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U,  // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U,  // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U,  // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U,  // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U,  // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U,  // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U,  // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U,  // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U,  // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U,  // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U,  // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U,  // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U,  // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U,  // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U,  // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U,  // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U,  // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U,  // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U,  // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U,  // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U,  // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U,  // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U,  // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U,  // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U,  // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U,  // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U,  // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U,  // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U,  // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U,  // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U,  // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U,  // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+   161926454U,  // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U,  // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U,  // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U,  // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+   161926454U,  // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U,  // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U,  // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U,  // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U,  // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U,  // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U,  // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U,  // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U,  // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U,  // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U,  // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U,  // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U,  // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U,  // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U,  // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U,  // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U,  // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+    27705344U,  // <4,u,6,7>: Cost 0 copy RHS
+    27705344U,  // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U,  // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U,  // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U,  // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U,  // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U,  // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U,  // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U,  // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U,  // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U,  // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U,  // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U,  // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U,  // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U,  // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+   161926454U,  // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U,  // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U,  // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+    27705344U,  // <4,u,u,7>: Cost 0 copy RHS
+    27705344U,  // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U,  // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U,  // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U,  // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U,  // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U,  // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U,  // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U,  // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U,  // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U,  // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U,  // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U,  // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U,  // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U,  // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U,  // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U,  // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U,  // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U,  // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U,  // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U,  // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U,  // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U,  // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U,  // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U,  // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U,  // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U,  // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U,  // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U,  // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U,  // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U,  // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U,  // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U,  // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U,  // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U,  // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U,  // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U,  // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U,  // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U,  // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U,  // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U,  // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U,  // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U,  // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U,  // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U,  // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U,  // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U,  // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U,  // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U,  // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U,  // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U,  // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U,  // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U,  // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U,  // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U,  // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U,  // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U,  // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U,  // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U,  // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U,  // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U,  // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U,  // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U,  // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U,  // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U,  // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U,  // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U,  // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U,  // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U,  // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U,  // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U,  // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U,  // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U,  // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U,  // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U,  // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U,  // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U,  // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U,  // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U,  // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U,  // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U,  // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U,  // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U,  // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U,  // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U,  // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U,  // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U,  // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U,  // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U,  // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U,  // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U,  // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U,  // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U,  // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U,  // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U,  // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U,  // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U,  // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U,  // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U,  // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U,  // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U,  // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U,  // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U,  // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U,  // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U,  // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U,  // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U,  // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U,  // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U,  // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U,  // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U,  // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U,  // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U,  // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U,  // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U,  // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U,  // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U,  // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U,  // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U,  // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U,  // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U,  // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U,  // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U,  // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U,  // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U,  // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U,  // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U,  // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U,  // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U,  // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U,  // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U,  // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U,  // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U,  // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U,  // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U,  // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U,  // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U,  // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U,  // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U,  // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U,  // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U,  // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U,  // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U,  // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U,  // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U,  // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U,  // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U,  // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U,  // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U,  // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U,  // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U,  // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U,  // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U,  // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U,  // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U,  // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U,  // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U,  // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U,  // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U,  // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U,  // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U,  // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U,  // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U,  // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U,  // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U,  // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U,  // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U,  // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U,  // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U,  // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U,  // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U,  // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U,  // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U,  // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U,  // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U,  // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U,  // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U,  // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U,  // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U,  // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U,  // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U,  // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U,  // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U,  // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U,  // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U,  // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U,  // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U,  // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U,  // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U,  // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U,  // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U,  // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U,  // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U,  // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U,  // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U,  // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U,  // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U,  // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U,  // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U,  // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U,  // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U,  // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U,  // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U,  // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U,  // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U,  // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U,  // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U,  // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U,  // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U,  // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U,  // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U,  // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U,  // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U,  // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U,  // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U,  // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U,  // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U,  // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U,  // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U,  // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U,  // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U,  // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U,  // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U,  // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U,  // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U,  // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U,  // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U,  // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U,  // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U,  // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U,  // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U,  // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U,  // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U,  // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U,  // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U,  // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U,  // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U,  // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U,  // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U,  // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U,  // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U,  // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U,  // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U,  // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U,  // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U,  // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U,  // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U,  // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U,  // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U,  // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U,  // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U,  // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U,  // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U,  // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U,  // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U,  // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U,  // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U,  // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U,  // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U,  // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U,  // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U,  // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U,  // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U,  // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U,  // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U,  // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U,  // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U,  // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U,  // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U,  // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U,  // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U,  // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U,  // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U,  // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U,  // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U,  // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U,  // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U,  // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U,  // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U,  // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U,  // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U,  // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U,  // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U,  // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U,  // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U,  // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U,  // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U,  // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U,  // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U,  // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U,  // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U,  // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U,  // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U,  // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U,  // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U,  // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U,  // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U,  // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U,  // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U,  // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U,  // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U,  // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U,  // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U,  // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U,  // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U,  // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U,  // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U,  // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U,  // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U,  // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U,  // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U,  // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U,  // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U,  // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U,  // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U,  // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U,  // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U,  // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U,  // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U,  // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U,  // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U,  // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U,  // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U,  // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U,  // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U,  // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U,  // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U,  // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U,  // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U,  // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U,  // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U,  // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U,  // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U,  // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U,  // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U,  // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U,  // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U,  // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U,  // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U,  // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U,  // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U,  // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U,  // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U,  // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U,  // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U,  // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U,  // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U,  // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U,  // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U,  // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U,  // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U,  // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U,  // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U,  // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U,  // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U,  // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U,  // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U,  // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U,  // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U,  // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U,  // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U,  // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U,  // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U,  // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U,  // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U,  // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U,  // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U,  // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U,  // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U,  // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U,  // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U,  // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U,  // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U,  // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U,  // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U,  // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U,  // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U,  // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U,  // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U,  // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U,  // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U,  // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U,  // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U,  // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U,  // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U,  // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U,  // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U,  // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U,  // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U,  // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U,  // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U,  // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U,  // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U,  // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U,  // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U,  // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+    94817590U,  // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U,  // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+    94965064U,  // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U,  // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U,  // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U,  // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U,  // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U,  // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U,  // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+    94825783U,  // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U,  // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+    94973257U,  // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U,  // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U,  // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U,  // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U,  // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U,  // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U,  // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U,  // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U,  // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U,  // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U,  // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U,  // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U,  // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U,  // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U,  // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U,  // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U,  // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U,  // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U,  // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U,  // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U,  // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U,  // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U,  // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U,  // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U,  // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U,  // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U,  // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U,  // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U,  // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U,  // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U,  // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U,  // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U,  // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U,  // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U,  // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U,  // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U,  // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U,  // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U,  // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U,  // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U,  // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U,  // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U,  // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U,  // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U,  // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U,  // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U,  // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U,  // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U,  // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U,  // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U,  // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+   229035318U,  // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U,  // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U,  // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+   229035318U,  // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U,  // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U,  // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U,  // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U,  // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U,  // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U,  // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U,  // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U,  // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U,  // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U,  // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U,  // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U,  // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U,  // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U,  // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U,  // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U,  // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U,  // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U,  // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U,  // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U,  // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U,  // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U,  // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U,  // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+   229035318U,  // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U,  // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U,  // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+   229035318U,  // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U,  // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U,  // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U,  // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U,  // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U,  // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U,  // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U,  // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U,  // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U,  // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U,  // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U,  // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U,  // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U,  // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U,  // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U,  // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U,  // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U,  // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U,  // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U,  // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U,  // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U,  // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U,  // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U,  // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U,  // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U,  // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U,  // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U,  // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U,  // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U,  // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U,  // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U,  // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U,  // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U,  // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U,  // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U,  // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U,  // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U,  // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U,  // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U,  // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U,  // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U,  // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U,  // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U,  // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U,  // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U,  // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U,  // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U,  // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U,  // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U,  // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U,  // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U,  // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U,  // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U,  // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U,  // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U,  // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U,  // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U,  // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U,  // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U,  // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U,  // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U,  // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U,  // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U,  // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+   430358630U,  // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U,  // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U,  // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U,  // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+   430361910U,  // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U,  // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U,  // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U,  // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+   430364462U,  // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+   430366822U,  // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U,  // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U,  // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U,  // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+   430370103U,  // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U,  // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U,  // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U,  // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+   430372654U,  // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U,  // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U,  // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U,  // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U,  // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U,  // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U,  // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U,  // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U,  // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U,  // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U,  // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U,  // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U,  // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U,  // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U,  // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U,  // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U,  // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U,  // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U,  // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U,  // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U,  // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U,  // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U,  // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U,  // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U,  // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U,  // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U,  // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U,  // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U,  // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U,  // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U,  // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U,  // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U,  // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U,  // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U,  // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U,  // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U,  // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U,  // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U,  // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U,  // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U,  // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U,  // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U,  // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U,  // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U,  // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U,  // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U,  // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U,  // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U,  // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U,  // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U,  // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U,  // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U,  // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U,  // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U,  // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U,  // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U,  // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U,  // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U,  // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U,  // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U,  // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U,  // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U,  // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U,  // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U,  // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U,  // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U,  // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U,  // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U,  // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U,  // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U,  // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U,  // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U,  // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U,  // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U,  // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U,  // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U,  // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U,  // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U,  // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U,  // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U,  // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U,  // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U,  // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U,  // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U,  // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U,  // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U,  // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U,  // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U,  // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U,  // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U,  // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U,  // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U,  // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U,  // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U,  // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U,  // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U,  // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U,  // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U,  // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U,  // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U,  // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U,  // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U,  // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U,  // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U,  // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U,  // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U,  // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U,  // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U,  // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U,  // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U,  // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U,  // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U,  // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U,  // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U,  // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U,  // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U,  // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U,  // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U,  // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U,  // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U,  // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U,  // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U,  // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U,  // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U,  // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U,  // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U,  // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U,  // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U,  // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U,  // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U,  // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U,  // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+   229035318U,  // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U,  // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U,  // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+   229035318U,  // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U,  // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U,  // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U,  // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U,  // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U,  // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U,  // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U,  // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U,  // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U,  // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+   430506086U,  // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U,  // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U,  // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U,  // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+   430509384U,  // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U,  // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+   118708378U,  // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U,  // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+   430511918U,  // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+   430514278U,  // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U,  // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U,  // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U,  // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+   430517577U,  // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+   229035318U,  // <5,u,u,5>: Cost 1 vdup1 RHS
+   118716571U,  // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U,  // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+   430520110U,  // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U,  // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U,  // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U,  // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U,  // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U,  // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U,  // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U,  // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U,  // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U,  // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U,  // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U,  // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U,  // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U,  // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U,  // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U,  // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U,  // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U,  // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U,  // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U,  // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U,  // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U,  // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U,  // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U,  // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U,  // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U,  // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U,  // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U,  // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U,  // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U,  // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U,  // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U,  // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U,  // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U,  // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U,  // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U,  // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U,  // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U,  // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U,  // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U,  // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U,  // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U,  // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U,  // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U,  // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U,  // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U,  // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U,  // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U,  // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U,  // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U,  // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U,  // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U,  // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U,  // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U,  // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U,  // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U,  // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U,  // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U,  // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U,  // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U,  // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U,  // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U,  // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U,  // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U,  // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U,  // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U,  // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U,  // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U,  // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U,  // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U,  // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U,  // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U,  // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U,  // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U,  // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U,  // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U,  // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U,  // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U,  // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U,  // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U,  // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U,  // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U,  // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U,  // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U,  // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U,  // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U,  // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U,  // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U,  // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U,  // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U,  // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U,  // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U,  // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U,  // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U,  // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U,  // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U,  // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U,  // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U,  // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U,  // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U,  // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U,  // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U,  // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U,  // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U,  // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U,  // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U,  // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U,  // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U,  // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U,  // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U,  // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U,  // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U,  // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U,  // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U,  // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U,  // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U,  // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U,  // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U,  // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U,  // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U,  // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U,  // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U,  // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U,  // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U,  // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U,  // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U,  // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U,  // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U,  // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U,  // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U,  // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U,  // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U,  // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U,  // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U,  // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U,  // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U,  // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U,  // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U,  // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U,  // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U,  // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U,  // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U,  // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U,  // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U,  // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U,  // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U,  // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U,  // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U,  // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U,  // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U,  // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U,  // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U,  // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U,  // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U,  // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U,  // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U,  // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U,  // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U,  // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U,  // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U,  // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U,  // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U,  // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U,  // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U,  // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U,  // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U,  // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U,  // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U,  // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U,  // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U,  // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U,  // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U,  // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U,  // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U,  // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U,  // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U,  // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U,  // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U,  // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U,  // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U,  // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U,  // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U,  // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U,  // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U,  // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U,  // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U,  // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U,  // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U,  // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U,  // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U,  // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U,  // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U,  // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U,  // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U,  // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U,  // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U,  // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U,  // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U,  // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U,  // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U,  // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U,  // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U,  // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U,  // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U,  // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U,  // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U,  // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U,  // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U,  // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U,  // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U,  // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U,  // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U,  // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U,  // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U,  // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U,  // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U,  // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U,  // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U,  // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U,  // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U,  // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U,  // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U,  // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U,  // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U,  // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U,  // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U,  // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U,  // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U,  // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U,  // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U,  // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U,  // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U,  // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U,  // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U,  // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U,  // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U,  // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U,  // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U,  // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U,  // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U,  // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U,  // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U,  // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U,  // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U,  // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U,  // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U,  // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U,  // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U,  // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U,  // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U,  // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U,  // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U,  // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U,  // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U,  // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U,  // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U,  // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U,  // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U,  // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U,  // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U,  // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U,  // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U,  // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U,  // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U,  // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U,  // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U,  // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U,  // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U,  // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U,  // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U,  // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U,  // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U,  // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U,  // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U,  // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U,  // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U,  // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U,  // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U,  // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U,  // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U,  // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U,  // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U,  // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U,  // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U,  // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U,  // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U,  // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U,  // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U,  // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U,  // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U,  // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U,  // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U,  // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U,  // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U,  // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U,  // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U,  // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U,  // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U,  // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U,  // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U,  // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U,  // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U,  // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U,  // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U,  // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U,  // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U,  // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U,  // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U,  // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U,  // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U,  // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U,  // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U,  // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U,  // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U,  // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U,  // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U,  // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U,  // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U,  // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U,  // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U,  // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U,  // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U,  // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U,  // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U,  // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U,  // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U,  // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U,  // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U,  // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U,  // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U,  // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U,  // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U,  // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U,  // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U,  // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U,  // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U,  // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U,  // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U,  // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U,  // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U,  // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U,  // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U,  // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U,  // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U,  // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U,  // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U,  // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U,  // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U,  // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U,  // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U,  // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U,  // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U,  // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U,  // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U,  // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U,  // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U,  // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U,  // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U,  // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U,  // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U,  // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U,  // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U,  // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U,  // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U,  // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U,  // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U,  // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U,  // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U,  // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U,  // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U,  // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U,  // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U,  // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U,  // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U,  // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U,  // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U,  // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U,  // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U,  // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U,  // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U,  // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U,  // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U,  // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U,  // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U,  // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U,  // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U,  // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U,  // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U,  // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U,  // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U,  // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U,  // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U,  // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U,  // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U,  // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U,  // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U,  // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U,  // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U,  // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U,  // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U,  // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U,  // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U,  // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U,  // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U,  // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U,  // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U,  // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U,  // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U,  // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U,  // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U,  // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U,  // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U,  // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U,  // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U,  // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U,  // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U,  // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U,  // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U,  // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U,  // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U,  // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U,  // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U,  // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U,  // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U,  // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U,  // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U,  // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U,  // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U,  // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U,  // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U,  // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U,  // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U,  // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U,  // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U,  // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U,  // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U,  // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U,  // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U,  // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U,  // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U,  // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U,  // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U,  // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U,  // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U,  // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U,  // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U,  // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U,  // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U,  // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U,  // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U,  // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U,  // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U,  // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U,  // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U,  // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U,  // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U,  // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U,  // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U,  // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U,  // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U,  // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U,  // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U,  // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U,  // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U,  // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U,  // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U,  // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U,  // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U,  // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U,  // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U,  // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U,  // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U,  // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U,  // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U,  // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U,  // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U,  // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U,  // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U,  // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U,  // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U,  // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U,  // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U,  // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U,  // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U,  // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U,  // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U,  // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U,  // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U,  // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U,  // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U,  // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U,  // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U,  // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U,  // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U,  // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U,  // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U,  // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U,  // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U,  // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U,  // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U,  // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U,  // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U,  // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U,  // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U,  // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U,  // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U,  // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U,  // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U,  // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U,  // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U,  // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U,  // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U,  // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U,  // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U,  // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U,  // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U,  // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U,  // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U,  // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U,  // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U,  // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U,  // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U,  // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U,  // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U,  // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U,  // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U,  // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U,  // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U,  // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U,  // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U,  // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U,  // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U,  // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U,  // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U,  // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U,  // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U,  // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U,  // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U,  // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U,  // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U,  // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U,  // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U,  // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U,  // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U,  // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U,  // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U,  // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U,  // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+   296144182U,  // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U,  // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+   296144182U,  // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U,  // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U,  // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U,  // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U,  // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U,  // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U,  // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U,  // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U,  // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U,  // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U,  // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U,  // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U,  // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U,  // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U,  // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U,  // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+   296144182U,  // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U,  // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+   296144182U,  // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U,  // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+   497467494U,  // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U,  // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U,  // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U,  // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U,  // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U,  // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U,  // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+   497468061U,  // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U,  // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U,  // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U,  // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U,  // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U,  // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U,  // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U,  // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U,  // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U,  // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U,  // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U,  // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U,  // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U,  // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U,  // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U,  // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U,  // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U,  // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U,  // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U,  // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U,  // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U,  // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U,  // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U,  // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U,  // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U,  // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U,  // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U,  // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U,  // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U,  // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U,  // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U,  // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U,  // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+   497470774U,  // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U,  // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U,  // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+   497471017U,  // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U,  // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U,  // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U,  // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U,  // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U,  // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U,  // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U,  // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U,  // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U,  // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U,  // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U,  // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U,  // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U,  // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U,  // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U,  // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U,  // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U,  // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U,  // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U,  // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U,  // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U,  // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U,  // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U,  // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U,  // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U,  // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U,  // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U,  // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U,  // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+   497473326U,  // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U,  // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U,  // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U,  // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+   497473690U,  // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U,  // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U,  // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+   497473893U,  // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U,  // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+   497475686U,  // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U,  // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U,  // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U,  // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U,  // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U,  // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U,  // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+   497476253U,  // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U,  // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U,  // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U,  // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U,  // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U,  // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U,  // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U,  // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U,  // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U,  // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U,  // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U,  // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U,  // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U,  // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U,  // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U,  // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U,  // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U,  // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U,  // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U,  // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U,  // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U,  // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U,  // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U,  // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U,  // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U,  // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U,  // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U,  // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U,  // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U,  // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U,  // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U,  // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U,  // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+   497478967U,  // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U,  // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U,  // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+   497479209U,  // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U,  // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U,  // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U,  // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U,  // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U,  // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U,  // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U,  // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U,  // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U,  // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U,  // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U,  // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U,  // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U,  // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U,  // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U,  // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+   296144182U,  // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U,  // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+   296144182U,  // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U,  // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U,  // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U,  // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U,  // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U,  // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U,  // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U,  // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U,  // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U,  // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U,  // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+   497481518U,  // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U,  // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U,  // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U,  // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+   497481882U,  // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+   296144182U,  // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U,  // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+   497482085U,  // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U,  // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U,  // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U,  // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U,  // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U,  // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U,  // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U,  // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U,  // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U,  // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U,  // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U,  // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+   564576358U,  // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U,  // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U,  // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U,  // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U,  // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U,  // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+   564576412U,  // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U,  // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U,  // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U,  // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U,  // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U,  // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U,  // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U,  // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U,  // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U,  // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U,  // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U,  // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U,  // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U,  // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U,  // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U,  // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U,  // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U,  // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U,  // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U,  // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U,  // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U,  // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U,  // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U,  // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U,  // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U,  // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U,  // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U,  // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U,  // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U,  // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U,  // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U,  // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U,  // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U,  // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U,  // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U,  // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U,  // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U,  // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U,  // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U,  // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U,  // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U,  // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U,  // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U,  // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U,  // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U,  // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U,  // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U,  // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U,  // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U,  // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U,  // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U,  // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U,  // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U,  // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U,  // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U,  // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U,  // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+   564576925U,  // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U,  // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U,  // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U,  // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U,  // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U,  // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+   564576979U,  // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U,  // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U,  // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U,  // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U,  // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U,  // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U,  // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U,  // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U,  // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U,  // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U,  // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U,  // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U,  // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U,  // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U,  // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U,  // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U,  // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U,  // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U,  // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U,  // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U,  // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U,  // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U,  // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U,  // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U,  // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U,  // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U,  // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U,  // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U,  // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U,  // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U,  // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U,  // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U,  // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U,  // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U,  // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U,  // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U,  // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U,  // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U,  // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U,  // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U,  // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U,  // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U,  // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U,  // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U,  // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U,  // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U,  // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U,  // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U,  // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U,  // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U,  // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U,  // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U,  // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U,  // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U,  // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U,  // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U,  // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U,  // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U,  // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U,  // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U,  // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U,  // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U,  // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U,  // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U,  // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U,  // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U,  // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U,  // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U,  // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U,  // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U,  // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U,  // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U,  // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U,  // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U,  // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U,  // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U,  // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U,  // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U,  // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U,  // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U,  // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U,  // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U,  // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U,  // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U,  // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U,  // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U,  // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U,  // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U,  // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U,  // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U,  // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U,  // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U,  // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U,  // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U,  // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U,  // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U,  // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U,  // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U,  // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U,  // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U,  // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U,  // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U,  // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U,  // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U,  // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U,  // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U,  // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U,  // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U,  // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U,  // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U,  // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U,  // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U,  // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U,  // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U,  // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U,  // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U,  // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U,  // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U,  // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U,  // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U,  // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U,  // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U,  // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U,  // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U,  // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U,  // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U,  // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U,  // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U,  // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U,  // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U,  // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U,  // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U,  // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U,  // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U,  // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U,  // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U,  // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U,  // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U,  // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U,  // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U,  // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U,  // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U,  // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U,  // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U,  // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U,  // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U,  // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U,  // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U,  // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U,  // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U,  // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U,  // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U,  // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U,  // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U,  // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U,  // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U,  // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U,  // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U,  // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U,  // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U,  // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U,  // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U,  // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U,  // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U,  // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U,  // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U,  // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U,  // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U,  // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U,  // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U,  // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U,  // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U,  // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U,  // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U,  // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U,  // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U,  // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U,  // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U,  // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U,  // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U,  // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U,  // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U,  // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U,  // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U,  // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U,  // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U,  // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U,  // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U,  // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U,  // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U,  // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U,  // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U,  // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U,  // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U,  // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U,  // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U,  // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U,  // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U,  // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U,  // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U,  // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U,  // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U,  // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U,  // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U,  // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U,  // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U,  // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U,  // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U,  // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U,  // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U,  // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U,  // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U,  // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U,  // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U,  // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U,  // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U,  // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U,  // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U,  // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U,  // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U,  // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U,  // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U,  // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U,  // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U,  // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U,  // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U,  // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U,  // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U,  // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U,  // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U,  // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U,  // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U,  // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U,  // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U,  // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U,  // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U,  // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U,  // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U,  // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U,  // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U,  // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U,  // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U,  // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U,  // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U,  // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U,  // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U,  // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U,  // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U,  // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U,  // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U,  // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U,  // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U,  // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U,  // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U,  // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U,  // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U,  // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U,  // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U,  // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U,  // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U,  // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U,  // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U,  // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U,  // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U,  // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U,  // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U,  // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U,  // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U,  // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U,  // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U,  // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U,  // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U,  // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U,  // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U,  // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U,  // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U,  // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U,  // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U,  // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U,  // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U,  // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U,  // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U,  // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U,  // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U,  // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U,  // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U,  // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U,  // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U,  // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U,  // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U,  // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U,  // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U,  // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U,  // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U,  // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+   564579638U,  // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U,  // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+   564579656U,  // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U,  // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U,  // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U,  // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U,  // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U,  // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U,  // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U,  // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U,  // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U,  // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U,  // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U,  // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U,  // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U,  // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U,  // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U,  // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U,  // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U,  // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U,  // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U,  // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U,  // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U,  // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U,  // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U,  // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U,  // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+   564579881U,  // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U,  // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+   564579899U,  // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U,  // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U,  // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U,  // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U,  // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U,  // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U,  // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U,  // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U,  // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U,  // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U,  // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U,  // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U,  // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U,  // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U,  // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U,  // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U,  // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U,  // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U,  // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U,  // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U,  // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U,  // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U,  // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U,  // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U,  // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U,  // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U,  // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U,  // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U,  // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U,  // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U,  // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U,  // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U,  // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U,  // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U,  // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U,  // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U,  // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U,  // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U,  // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U,  // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U,  // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U,  // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U,  // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U,  // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U,  // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U,  // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U,  // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U,  // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U,  // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U,  // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U,  // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U,  // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U,  // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U,  // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U,  // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U,  // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U,  // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U,  // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U,  // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U,  // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U,  // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U,  // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U,  // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U,  // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U,  // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U,  // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U,  // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U,  // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U,  // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U,  // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U,  // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U,  // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U,  // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U,  // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U,  // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U,  // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U,  // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U,  // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U,  // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U,  // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U,  // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U,  // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U,  // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U,  // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U,  // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U,  // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U,  // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U,  // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U,  // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U,  // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U,  // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U,  // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U,  // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U,  // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U,  // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U,  // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U,  // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U,  // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U,  // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U,  // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U,  // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U,  // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U,  // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U,  // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U,  // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U,  // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U,  // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U,  // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U,  // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U,  // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U,  // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U,  // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U,  // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U,  // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U,  // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U,  // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U,  // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U,  // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U,  // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U,  // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U,  // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U,  // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U,  // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U,  // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U,  // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U,  // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U,  // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U,  // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U,  // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U,  // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U,  // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U,  // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U,  // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U,  // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U,  // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U,  // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U,  // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U,  // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U,  // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U,  // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U,  // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U,  // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U,  // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U,  // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U,  // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U,  // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U,  // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U,  // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U,  // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U,  // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U,  // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U,  // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U,  // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U,  // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U,  // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U,  // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U,  // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U,  // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U,  // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U,  // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U,  // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U,  // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U,  // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U,  // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U,  // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U,  // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U,  // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U,  // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U,  // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U,  // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U,  // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U,  // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U,  // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U,  // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U,  // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U,  // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U,  // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U,  // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U,  // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U,  // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U,  // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U,  // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U,  // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U,  // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U,  // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U,  // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U,  // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U,  // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U,  // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U,  // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U,  // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U,  // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U,  // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U,  // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U,  // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U,  // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U,  // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U,  // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U,  // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U,  // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U,  // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U,  // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U,  // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U,  // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U,  // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U,  // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U,  // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U,  // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U,  // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U,  // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U,  // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U,  // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U,  // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U,  // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U,  // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U,  // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U,  // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U,  // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U,  // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U,  // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U,  // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U,  // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U,  // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U,  // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U,  // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U,  // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U,  // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U,  // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U,  // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U,  // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U,  // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U,  // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U,  // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+   363253046U,  // <7,7,7,7>: Cost 1 vdup3 RHS
+   363253046U,  // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U,  // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U,  // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U,  // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U,  // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U,  // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U,  // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U,  // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+   363253046U,  // <7,7,u,7>: Cost 1 vdup3 RHS
+   363253046U,  // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U,  // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U,  // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U,  // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U,  // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U,  // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U,  // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U,  // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U,  // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U,  // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U,  // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U,  // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+   564582190U,  // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U,  // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U,  // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U,  // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U,  // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U,  // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+   564582244U,  // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U,  // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U,  // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U,  // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U,  // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U,  // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U,  // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U,  // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U,  // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U,  // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U,  // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U,  // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U,  // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U,  // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U,  // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U,  // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U,  // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U,  // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U,  // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U,  // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U,  // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U,  // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U,  // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U,  // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U,  // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U,  // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U,  // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U,  // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U,  // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U,  // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U,  // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U,  // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U,  // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U,  // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+   564582554U,  // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U,  // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+   564582572U,  // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U,  // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U,  // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U,  // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U,  // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U,  // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U,  // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U,  // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U,  // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U,  // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U,  // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U,  // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U,  // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U,  // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U,  // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U,  // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U,  // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+   363253046U,  // <7,u,7,7>: Cost 1 vdup3 RHS
+   363253046U,  // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U,  // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U,  // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+   564582757U,  // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U,  // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U,  // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U,  // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+   564582797U,  // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+   363253046U,  // <7,u,u,7>: Cost 1 vdup3 RHS
+   564582811U,  // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+   135053414U,  // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U,  // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U,  // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U,  // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U,  // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U,  // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U,  // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U,  // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+   135053414U,  // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U,  // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U,  // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+   537747563U,  // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U,  // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U,  // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U,  // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U,  // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U,  // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+   537747612U,  // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U,  // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U,  // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U,  // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U,  // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U,  // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U,  // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U,  // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U,  // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U,  // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U,  // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U,  // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+    72589981U,  // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U,  // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U,  // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U,  // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U,  // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U,  // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+    73032403U,  // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U,  // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U,  // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U,  // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U,  // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U,  // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U,  // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U,  // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U,  // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U,  // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U,  // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U,  // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U,  // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U,  // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U,  // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U,  // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U,  // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U,  // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U,  // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U,  // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U,  // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U,  // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U,  // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U,  // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U,  // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U,  // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U,  // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U,  // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U,  // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U,  // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U,  // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U,  // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U,  // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U,  // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U,  // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U,  // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U,  // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+   135053414U,  // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U,  // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+   537748125U,  // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U,  // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U,  // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U,  // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U,  // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U,  // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+   537748179U,  // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U,  // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U,  // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U,  // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U,  // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U,  // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U,  // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U,  // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U,  // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U,  // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U,  // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+   202162278U,  // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U,  // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U,  // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U,  // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U,  // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U,  // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U,  // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+   202162278U,  // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U,  // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U,  // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U,  // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+      835584U,  // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U,  // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U,  // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U,  // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U,  // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+      835584U,  // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U,  // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U,  // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U,  // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U,  // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U,  // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U,  // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U,  // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U,  // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U,  // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U,  // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U,  // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U,  // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U,  // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U,  // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U,  // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U,  // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U,  // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U,  // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U,  // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U,  // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U,  // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U,  // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U,  // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U,  // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U,  // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U,  // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U,  // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U,  // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U,  // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U,  // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U,  // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U,  // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U,  // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U,  // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U,  // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U,  // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U,  // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U,  // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U,  // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U,  // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U,  // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U,  // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U,  // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U,  // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U,  // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U,  // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+   202162278U,  // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U,  // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+      835584U,  // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U,  // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U,  // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U,  // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U,  // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+      835584U,  // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U,  // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U,  // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U,  // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U,  // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U,  // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U,  // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U,  // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U,  // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U,  // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U,  // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U,  // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U,  // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U,  // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U,  // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U,  // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U,  // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U,  // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U,  // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U,  // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U,  // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+   269271142U,  // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U,  // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U,  // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U,  // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U,  // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U,  // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+   269271142U,  // <u,2,2,u>: Cost 1 vdup2 LHS
+   408134301U,  // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U,  // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U,  // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U,  // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+   408137014U,  // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U,  // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U,  // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U,  // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+   408139566U,  // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U,  // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U,  // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U,  // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U,  // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U,  // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U,  // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U,  // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U,  // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U,  // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U,  // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U,  // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U,  // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U,  // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U,  // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U,  // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U,  // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U,  // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U,  // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U,  // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U,  // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U,  // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U,  // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U,  // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U,  // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U,  // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U,  // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U,  // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U,  // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U,  // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U,  // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U,  // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U,  // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U,  // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U,  // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U,  // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U,  // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+   408175266U,  // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U,  // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+   269271142U,  // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U,  // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+   408177974U,  // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U,  // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U,  // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U,  // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+   408180526U,  // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U,  // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+   471040156U,  // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U,  // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U,  // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U,  // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U,  // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U,  // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U,  // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+   471040669U,  // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U,  // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U,  // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U,  // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U,  // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U,  // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U,  // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U,  // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U,  // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U,  // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U,  // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U,  // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U,  // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U,  // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U,  // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U,  // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U,  // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U,  // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U,  // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U,  // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U,  // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U,  // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+   336380006U,  // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U,  // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U,  // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U,  // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U,  // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+   336380006U,  // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U,  // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U,  // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U,  // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U,  // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U,  // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+   471043382U,  // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U,  // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U,  // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+   471043625U,  // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U,  // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U,  // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U,  // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U,  // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U,  // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U,  // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U,  // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U,  // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U,  // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U,  // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U,  // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U,  // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U,  // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U,  // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U,  // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U,  // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U,  // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U,  // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U,  // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U,  // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U,  // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U,  // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U,  // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U,  // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U,  // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U,  // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U,  // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U,  // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+   471045934U,  // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U,  // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+   336380006U,  // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U,  // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+   471046298U,  // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U,  // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U,  // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+   471046501U,  // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U,  // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U,  // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U,  // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U,  // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U,  // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U,  // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U,  // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U,  // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U,  // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U,  // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U,  // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U,  // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U,  // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U,  // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U,  // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U,  // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U,  // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U,  // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U,  // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U,  // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U,  // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U,  // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U,  // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U,  // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U,  // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U,  // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U,  // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U,  // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U,  // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U,  // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U,  // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U,  // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U,  // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U,  // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U,  // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U,  // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U,  // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U,  // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U,  // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U,  // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+   161926454U,  // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U,  // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U,  // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U,  // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+   161926454U,  // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U,  // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U,  // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U,  // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U,  // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U,  // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U,  // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+   537750838U,  // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U,  // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+   537750856U,  // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U,  // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U,  // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U,  // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U,  // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U,  // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U,  // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U,  // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U,  // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U,  // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U,  // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U,  // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U,  // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U,  // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U,  // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U,  // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+    96808489U,  // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U,  // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+    96955963U,  // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U,  // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U,  // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U,  // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U,  // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+   161926454U,  // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U,  // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+   537751081U,  // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U,  // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+   537751099U,  // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U,  // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U,  // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U,  // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U,  // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U,  // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U,  // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U,  // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U,  // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U,  // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U,  // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U,  // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U,  // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U,  // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U,  // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U,  // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U,  // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U,  // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U,  // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U,  // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U,  // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U,  // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U,  // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U,  // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U,  // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U,  // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U,  // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U,  // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U,  // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U,  // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U,  // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U,  // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U,  // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U,  // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U,  // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U,  // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U,  // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U,  // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U,  // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U,  // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U,  // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U,  // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U,  // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U,  // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U,  // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U,  // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U,  // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U,  // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U,  // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U,  // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U,  // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+   229035318U,  // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U,  // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U,  // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+   229035318U,  // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U,  // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U,  // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U,  // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U,  // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U,  // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U,  // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U,  // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+    27705344U,  // <u,5,6,7>: Cost 0 copy RHS
+    27705344U,  // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U,  // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U,  // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U,  // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U,  // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U,  // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U,  // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U,  // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U,  // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U,  // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U,  // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U,  // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U,  // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U,  // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U,  // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+   229035318U,  // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U,  // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+    27705344U,  // <u,5,u,7>: Cost 0 copy RHS
+    27705344U,  // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U,  // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U,  // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U,  // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U,  // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U,  // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U,  // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U,  // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U,  // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U,  // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U,  // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U,  // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U,  // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U,  // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U,  // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U,  // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U,  // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U,  // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U,  // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U,  // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U,  // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U,  // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U,  // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U,  // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U,  // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U,  // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U,  // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U,  // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U,  // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U,  // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U,  // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U,  // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U,  // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U,  // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U,  // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U,  // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U,  // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U,  // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U,  // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U,  // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U,  // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U,  // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U,  // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U,  // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U,  // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U,  // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U,  // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U,  // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U,  // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U,  // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U,  // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U,  // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U,  // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U,  // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U,  // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U,  // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U,  // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U,  // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U,  // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U,  // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U,  // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+   296144182U,  // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U,  // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+   296144182U,  // <u,6,6,u>: Cost 1 vdup2 RHS
+   432349286U,  // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U,  // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U,  // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U,  // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+   432352809U,  // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U,  // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U,  // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U,  // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+   432355118U,  // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+   432357478U,  // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U,  // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U,  // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U,  // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+   432361002U,  // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U,  // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+   296144182U,  // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U,  // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+   432363310U,  // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U,  // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+   497614950U,  // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U,  // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U,  // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U,  // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U,  // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U,  // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U,  // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+   497615517U,  // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U,  // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U,  // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U,  // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U,  // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U,  // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U,  // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U,  // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U,  // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U,  // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U,  // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U,  // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U,  // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U,  // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U,  // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U,  // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U,  // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U,  // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U,  // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U,  // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U,  // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U,  // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U,  // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U,  // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U,  // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U,  // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U,  // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U,  // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U,  // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U,  // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U,  // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U,  // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U,  // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+   497618248U,  // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U,  // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U,  // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+   497618473U,  // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U,  // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U,  // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U,  // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U,  // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U,  // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U,  // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U,  // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U,  // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U,  // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U,  // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U,  // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U,  // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U,  // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U,  // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U,  // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U,  // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U,  // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U,  // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U,  // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U,  // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U,  // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U,  // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U,  // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U,  // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U,  // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+   363253046U,  // <u,7,7,7>: Cost 1 vdup3 RHS
+   363253046U,  // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U,  // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+   497620782U,  // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U,  // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U,  // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U,  // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+   497621146U,  // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U,  // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+   363253046U,  // <u,7,u,7>: Cost 1 vdup3 RHS
+   497621349U,  // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+   135053414U,  // <u,u,0,0>: Cost 1 vdup0 LHS
+   471081121U,  // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U,  // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U,  // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U,  // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U,  // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U,  // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U,  // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+   471081629U,  // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U,  // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+   202162278U,  // <u,u,1,1>: Cost 1 vdup1 LHS
+   537753390U,  // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U,  // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U,  // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U,  // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U,  // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U,  // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+   537753444U,  // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U,  // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U,  // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+   269271142U,  // <u,u,2,2>: Cost 1 vdup2 LHS
+      835584U,  // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U,  // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U,  // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U,  // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U,  // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+      835584U,  // <u,u,2,u>: Cost 0 copy LHS
+   408576723U,  // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U,  // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+   120371557U,  // <u,u,3,2>: Cost 1 vrev LHS
+   336380006U,  // <u,u,3,3>: Cost 1 vdup3 LHS
+   408579382U,  // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U,  // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U,  // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U,  // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+   408581934U,  // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U,  // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U,  // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U,  // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U,  // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+   161926454U,  // <u,u,4,4>: Cost 1 vdup0 RHS
+   471084342U,  // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U,  // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U,  // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+   471084585U,  // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U,  // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U,  // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U,  // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U,  // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U,  // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+   229035318U,  // <u,u,5,5>: Cost 1 vdup1 RHS
+   537753754U,  // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U,  // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+   537753772U,  // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U,  // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U,  // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U,  // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U,  // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U,  // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U,  // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+   296144182U,  // <u,u,6,6>: Cost 1 vdup2 RHS
+    27705344U,  // <u,u,6,7>: Cost 0 copy RHS
+    27705344U,  // <u,u,6,u>: Cost 0 copy RHS
+   432496742U,  // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U,  // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U,  // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U,  // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+   432500283U,  // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U,  // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+   120699277U,  // <u,u,7,6>: Cost 1 vrev RHS
+   363253046U,  // <u,u,7,7>: Cost 1 vdup3 RHS
+   432502574U,  // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+   408617688U,  // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+   471086894U,  // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+   537753957U,  // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+      835584U,  // <u,u,u,3>: Cost 0 copy LHS
+   408620342U,  // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+   471087258U,  // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+   537753997U,  // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+    27705344U,  // <u,u,u,7>: Cost 0 copy RHS
+      835584U,  // <u,u,u,u>: Cost 0 copy LHS
   0
 };
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
index d5bc3f6..ad51bc1 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -28,7 +28,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/BitVector.h"
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 305b232..22d15b5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -201,6 +201,10 @@ def CPSR    : ARMReg<0, "cpsr">;
 def FPSCR   : ARMReg<1, "fpscr">;
 def ITSTATE : ARMReg<2, "itstate">;
 
+// Special Registers - only available in privileged mode.
+def FPSID   : ARMReg<0, "fpsid">;
+def FPEXC   : ARMReg<8, "fpexc">;
+
 // Register classes.
 //
 // pc  == Program Counter
@@ -256,7 +260,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
 
 // restricted GPR register class. Many Thumb2 instructions allow the full
 // register range for operands, but have undefined behaviours when PC
-// or SP (R13 or R15) are used. The ARM ARM refers to these operands
+// or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
 def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
                                             R7, R8, R9, R10, R11, R12, LR]> {
@@ -381,27 +385,29 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
     iterator allocation_order_end(const MachineFunction &MF) const;
   }];
   let MethodBodies = [{
-    // VFP2
+    // VFP2 / VFPv3-D16
     static const unsigned ARM_DPR_VFP2[] = {
       ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
       ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
       ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
       ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
-    // VFP3
+    // VFP3: D8-D15 are callee saved and should be allocated last.
+    // Save other low registers for use as DPR_VFP2 and DPR_8 classes.
     static const unsigned ARM_DPR_VFP3[] = {
-      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
-      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
-      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
-      ARM::D12, ARM::D13, ARM::D14, ARM::D15,
       ARM::D16, ARM::D17, ARM::D18, ARM::D19,
       ARM::D20, ARM::D21, ARM::D22, ARM::D23,
       ARM::D24, ARM::D25, ARM::D26, ARM::D27,
-      ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+      ARM::D28, ARM::D29, ARM::D30, ARM::D31,
+      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
+      ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+
     DPRClass::iterator
     DPRClass::allocation_order_begin(const MachineFunction &MF) const {
       const TargetMachine &TM = MF.getTarget();
       const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
-      if (Subtarget.hasVFP3())
+      if (Subtarget.hasVFP3() && !Subtarget.hasD16())
         return ARM_DPR_VFP3;
       return ARM_DPR_VFP2;
     }
@@ -410,7 +416,7 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
     DPRClass::allocation_order_end(const MachineFunction &MF) const {
       const TargetMachine &TM = MF.getTarget();
       const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
-      if (Subtarget.hasVFP3())
+      if (Subtarget.hasVFP3() && !Subtarget.hasD16())
         return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
       else
         return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
@@ -438,6 +444,29 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
                         [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
                          Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15]> {
   let SubRegClasses = [(DPR dsub_0, dsub_1)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // Q4-Q7 are callee saved and should be allocated last.
+    // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QPR[] = {
+      ARM::Q8,  ARM::Q9,  ARM::Q10, ARM::Q11,
+      ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15,
+      ARM::Q0,  ARM::Q1,  ARM::Q2,  ARM::Q3,
+      ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7 };
+
+    QPRClass::iterator
+    QPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QPR;
+    }
+
+    QPRClass::iterator
+    QPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Subset of QPR that have 32-bit SPR subregs.
@@ -463,6 +492,27 @@ def QQPR : RegisterClass<"ARM", [v4i64],
                          [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
                        (QPR qsub_0, qsub_1)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // QQ2-QQ3 are callee saved and should be allocated last.
+    // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QQPR[] = {
+      ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7,
+      ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 };
+
+    QQPRClass::iterator
+    QQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QQPR;
+    }
+
+    QQPRClass::iterator
+    QQPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Subset of QQPR that have 32-bit SPR subregs.
@@ -483,6 +533,26 @@ def QQQQPR : RegisterClass<"ARM", [v8i64],
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
                             dsub_4, dsub_5, dsub_6, dsub_7),
                        (QPR qsub_0, qsub_1, qsub_2, qsub_3)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // QQQQ1 is callee saved and should be allocated last.
+    // Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QQQQPR[] = {
+      ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 };
+
+    QQQQPRClass::iterator
+    QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QQQQPR;
+    }
+
+    QQQQPRClass::iterator
+    QQQQPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Condition code registers.
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
index b60ccca..958c5c6 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSchedule.td
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -14,42 +14,86 @@ def IIC_iALUx      : InstrItinClass;
 def IIC_iALUi      : InstrItinClass;
 def IIC_iALUr      : InstrItinClass;
 def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
 def IIC_iALUsr     : InstrItinClass;
+def IIC_iBITi      : InstrItinClass;
+def IIC_iBITr      : InstrItinClass;
+def IIC_iBITsi     : InstrItinClass;
+def IIC_iBITsr     : InstrItinClass;
 def IIC_iUNAr      : InstrItinClass;
 def IIC_iUNAsi     : InstrItinClass;
-def IIC_iUNAsr     : InstrItinClass;
+def IIC_iEXTr      : InstrItinClass;
+def IIC_iEXTAr     : InstrItinClass;
+def IIC_iEXTAsr    : InstrItinClass;
 def IIC_iCMPi      : InstrItinClass;
 def IIC_iCMPr      : InstrItinClass;
 def IIC_iCMPsi     : InstrItinClass;
 def IIC_iCMPsr     : InstrItinClass;
+def IIC_iTSTi      : InstrItinClass;
+def IIC_iTSTr      : InstrItinClass;
+def IIC_iTSTsi     : InstrItinClass;
+def IIC_iTSTsr     : InstrItinClass;
 def IIC_iMOVi      : InstrItinClass;
 def IIC_iMOVr      : InstrItinClass;
 def IIC_iMOVsi     : InstrItinClass;
 def IIC_iMOVsr     : InstrItinClass;
+def IIC_iMOVix2    : InstrItinClass;
+def IIC_iMOVix2addpc : InstrItinClass;
+def IIC_iMOVix2ld  : InstrItinClass;
+def IIC_iMVNi      : InstrItinClass;
+def IIC_iMVNr      : InstrItinClass;
+def IIC_iMVNsi     : InstrItinClass;
+def IIC_iMVNsr     : InstrItinClass;
 def IIC_iCMOVi     : InstrItinClass;
 def IIC_iCMOVr     : InstrItinClass;
 def IIC_iCMOVsi    : InstrItinClass;
 def IIC_iCMOVsr    : InstrItinClass;
+def IIC_iCMOVix2   : InstrItinClass;
 def IIC_iMUL16     : InstrItinClass;
 def IIC_iMAC16     : InstrItinClass;
 def IIC_iMUL32     : InstrItinClass;
 def IIC_iMAC32     : InstrItinClass;
 def IIC_iMUL64     : InstrItinClass;
 def IIC_iMAC64     : InstrItinClass;
-def IIC_iLoadi     : InstrItinClass;
-def IIC_iLoadr     : InstrItinClass;
-def IIC_iLoadsi    : InstrItinClass;
-def IIC_iLoadiu    : InstrItinClass;
-def IIC_iLoadru    : InstrItinClass;
-def IIC_iLoadsiu   : InstrItinClass;
-def IIC_iLoadm     : InstrItinClass;
-def IIC_iStorei    : InstrItinClass;
-def IIC_iStorer    : InstrItinClass;
-def IIC_iStoresi   : InstrItinClass;
-def IIC_iStoreiu   : InstrItinClass;
-def IIC_iStoreru   : InstrItinClass;
-def IIC_iStoresiu  : InstrItinClass;
-def IIC_iStorem    : InstrItinClass;
+def IIC_iLoad_i    : InstrItinClass;
+def IIC_iLoad_r    : InstrItinClass;
+def IIC_iLoad_si   : InstrItinClass;
+def IIC_iLoad_iu   : InstrItinClass;
+def IIC_iLoad_ru   : InstrItinClass;
+def IIC_iLoad_siu  : InstrItinClass;
+def IIC_iLoad_bh_i   : InstrItinClass;
+def IIC_iLoad_bh_r   : InstrItinClass;
+def IIC_iLoad_bh_si  : InstrItinClass;
+def IIC_iLoad_bh_iu  : InstrItinClass;
+def IIC_iLoad_bh_ru  : InstrItinClass;
+def IIC_iLoad_bh_siu : InstrItinClass;
+def IIC_iLoad_d_i  : InstrItinClass;
+def IIC_iLoad_d_r  : InstrItinClass;
+def IIC_iLoad_d_ru : InstrItinClass;
+def IIC_iLoad_m    : InstrItinClass<0>;  // micro-coded
+def IIC_iLoad_mu   : InstrItinClass<0>;  // micro-coded
+def IIC_iLoad_mBr  : InstrItinClass<0>;  // micro-coded
+def IIC_iPop       : InstrItinClass<0>;  // micro-coded
+def IIC_iPop_Br    : InstrItinClass<0>;  // micro-coded
+def IIC_iLoadiALU  : InstrItinClass;
+def IIC_iStore_i   : InstrItinClass;
+def IIC_iStore_r   : InstrItinClass;
+def IIC_iStore_si  : InstrItinClass;
+def IIC_iStore_iu  : InstrItinClass;
+def IIC_iStore_ru  : InstrItinClass;
+def IIC_iStore_siu : InstrItinClass;
+def IIC_iStore_bh_i   : InstrItinClass;
+def IIC_iStore_bh_r   : InstrItinClass;
+def IIC_iStore_bh_si  : InstrItinClass;
+def IIC_iStore_bh_iu  : InstrItinClass;
+def IIC_iStore_bh_ru  : InstrItinClass;
+def IIC_iStore_bh_siu : InstrItinClass;
+def IIC_iStore_d_i   : InstrItinClass;
+def IIC_iStore_d_r   : InstrItinClass;
+def IIC_iStore_d_ru  : InstrItinClass;
+def IIC_iStore_m   : InstrItinClass<0>;  // micro-coded
+def IIC_iStore_mu  : InstrItinClass<0>;  // micro-coded
+def IIC_Preload    : InstrItinClass;
 def IIC_Br         : InstrItinClass;
 def IIC_fpSTAT     : InstrItinClass;
 def IIC_fpUNA32    : InstrItinClass;
@@ -80,19 +124,76 @@ def IIC_fpSQRT32   : InstrItinClass;
 def IIC_fpSQRT64   : InstrItinClass;
 def IIC_fpLoad32   : InstrItinClass;
 def IIC_fpLoad64   : InstrItinClass;
-def IIC_fpLoadm    : InstrItinClass;
+def IIC_fpLoad_m   : InstrItinClass<0>;  // micro-coded
+def IIC_fpLoad_mu  : InstrItinClass<0>;  // micro-coded
 def IIC_fpStore32  : InstrItinClass;
 def IIC_fpStore64  : InstrItinClass;
-def IIC_fpStorem   : InstrItinClass;
+def IIC_fpStore_m  : InstrItinClass<0>;  // micro-coded
+def IIC_fpStore_mu : InstrItinClass<0>;  // micro-coded
 def IIC_VLD1       : InstrItinClass;
+def IIC_VLD1x2     : InstrItinClass;
+def IIC_VLD1x3     : InstrItinClass;
+def IIC_VLD1x4     : InstrItinClass;
+def IIC_VLD1u      : InstrItinClass;
+def IIC_VLD1x2u    : InstrItinClass;
+def IIC_VLD1x3u    : InstrItinClass;
+def IIC_VLD1x4u    : InstrItinClass;
+def IIC_VLD1ln     : InstrItinClass;
+def IIC_VLD1lnu    : InstrItinClass;
+def IIC_VLD1dup    : InstrItinClass;
+def IIC_VLD1dupu   : InstrItinClass;
 def IIC_VLD2       : InstrItinClass;
+def IIC_VLD2x2     : InstrItinClass;
+def IIC_VLD2u      : InstrItinClass;
+def IIC_VLD2x2u    : InstrItinClass;
+def IIC_VLD2ln     : InstrItinClass;
+def IIC_VLD2lnu    : InstrItinClass;
+def IIC_VLD2dup    : InstrItinClass;
+def IIC_VLD2dupu   : InstrItinClass;
 def IIC_VLD3       : InstrItinClass;
+def IIC_VLD3ln     : InstrItinClass;
+def IIC_VLD3u      : InstrItinClass;
+def IIC_VLD3lnu    : InstrItinClass;
+def IIC_VLD3dup    : InstrItinClass;
+def IIC_VLD3dupu   : InstrItinClass;
 def IIC_VLD4       : InstrItinClass;
-def IIC_VST        : InstrItinClass;
+def IIC_VLD4ln     : InstrItinClass;
+def IIC_VLD4u      : InstrItinClass;
+def IIC_VLD4lnu    : InstrItinClass;
+def IIC_VLD4dup    : InstrItinClass;
+def IIC_VLD4dupu   : InstrItinClass;
+def IIC_VST1       : InstrItinClass;
+def IIC_VST1x2     : InstrItinClass;
+def IIC_VST1x3     : InstrItinClass;
+def IIC_VST1x4     : InstrItinClass;
+def IIC_VST1u      : InstrItinClass;
+def IIC_VST1x2u    : InstrItinClass;
+def IIC_VST1x3u    : InstrItinClass;
+def IIC_VST1x4u    : InstrItinClass;
+def IIC_VST1ln     : InstrItinClass;
+def IIC_VST1lnu    : InstrItinClass;
+def IIC_VST2       : InstrItinClass;
+def IIC_VST2x2     : InstrItinClass;
+def IIC_VST2u      : InstrItinClass;
+def IIC_VST2x2u    : InstrItinClass;
+def IIC_VST2ln     : InstrItinClass;
+def IIC_VST2lnu    : InstrItinClass;
+def IIC_VST3       : InstrItinClass;
+def IIC_VST3u      : InstrItinClass;
+def IIC_VST3ln     : InstrItinClass;
+def IIC_VST3lnu    : InstrItinClass;
+def IIC_VST4       : InstrItinClass;
+def IIC_VST4u      : InstrItinClass;
+def IIC_VST4ln     : InstrItinClass;
+def IIC_VST4lnu    : InstrItinClass;
 def IIC_VUNAD      : InstrItinClass;
 def IIC_VUNAQ      : InstrItinClass;
 def IIC_VBIND      : InstrItinClass;
 def IIC_VBINQ      : InstrItinClass;
+def IIC_VPBIND     : InstrItinClass;
+def IIC_VFMULD     : InstrItinClass;
+def IIC_VFMULQ     : InstrItinClass;
+def IIC_VMOV       : InstrItinClass;
 def IIC_VMOVImm    : InstrItinClass;
 def IIC_VMOVD      : InstrItinClass;
 def IIC_VMOVQ      : InstrItinClass;
@@ -101,6 +202,7 @@ def IIC_VMOVID     : InstrItinClass;
 def IIC_VMOVISL    : InstrItinClass;
 def IIC_VMOVSI     : InstrItinClass;
 def IIC_VMOVDI     : InstrItinClass;
+def IIC_VMOVN      : InstrItinClass;
 def IIC_VPERMD     : InstrItinClass;
 def IIC_VPERMQ     : InstrItinClass;
 def IIC_VPERMQ3    : InstrItinClass;
@@ -152,7 +254,7 @@ def IIC_VTBX4      : InstrItinClass;
 //===----------------------------------------------------------------------===//
 // Processor instruction itineraries.
 
-def GenericItineraries : ProcessorItineraries<[], []>;
+def GenericItineraries : ProcessorItineraries<[], [], []>;
 
 include "ARMScheduleV6.td"
 include "ARMScheduleA8.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
index 282abca..8d86c01 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
@@ -14,18 +14,17 @@
 //
 // Scheduling information derived from "Cortex-A8 Technical Reference Manual".
 // Functional Units.
-def A8_Issue   : FuncUnit; // issue
 def A8_Pipe0   : FuncUnit; // pipeline 0
 def A8_Pipe1   : FuncUnit; // pipeline 1
-def A8_LdSt0   : FuncUnit; // pipeline 0 load/store
-def A8_LdSt1   : FuncUnit; // pipeline 1 load/store
+def A8_LSPipe  : FuncUnit; // Load / store pipeline
 def A8_NPipe   : FuncUnit; // NEON ALU/MUL pipe
 def A8_NLSPipe : FuncUnit; // NEON LS pipe
 //
 // Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1
 //
 def CortexA8Itineraries : ProcessorItineraries<
-  [A8_Issue, A8_Pipe0, A8_Pipe1, A8_LdSt0, A8_LdSt1, A8_NPipe, A8_NLSPipe], [
+  [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe],
+  [], [
   // Two fully-pipelined integer ALU pipelines
   //
   // No operand cycles
@@ -35,12 +34,23 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
   InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
   InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
   InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
   //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
   // Unary Instructions that produce a result
   InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
   InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iUNAsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>,
   //
   // Compare instructions
   InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
@@ -48,124 +58,184 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
   InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
   //
+  // Test instructions
+  InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
   // Move instructions, unconditional
   InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
   InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
   InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
   InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                             InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LSPipe]>], [5]>,
   //
   // Move instructions, conditional
   InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
   InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
   InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
   InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                              InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
 
   // Integer multiply pipeline
   // Result written in E5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
   //
   InstrItinData<IIC_iMUL16   , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>,
-  InstrItinData<IIC_iMAC16   , [InstrStage<1, [A8_Pipe1], 0>,
-                                InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
-  InstrItinData<IIC_iMUL32   , [InstrStage<1, [A8_Pipe1], 0>,
-                                InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
-  InstrItinData<IIC_iMAC32   , [InstrStage<1, [A8_Pipe1], 0>,
-                                InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
-  InstrItinData<IIC_iMUL64   , [InstrStage<2, [A8_Pipe1], 0>,
-                                InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
-  InstrItinData<IIC_iMAC64   , [InstrStage<2, [A8_Pipe1], 0>,
-                                InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
 
   // Integer load pipeline
   //
-  // loads have an extra cycle of latency, but are fully pipelined
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  //
   // Immediate offset
-  InstrItinData<IIC_iLoadi   , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
   //
   // Register offset
-  InstrItinData<IIC_iLoadr   , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
   //
   // Scaled register offset, issues over 2 cycles
-  InstrItinData<IIC_iLoadsi  , [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0], 0>,
-                                InstrStage<1, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [4, 1, 1]>,
+  // FIXME: lsl by 2 takes 1 cycle.
+  InstrItinData<IIC_iLoad_si  , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iLoadiu  , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iLoadru  , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
   //
   // Scaled register offset with update, issues over 2 cycles
-  InstrItinData<IIC_iLoadsiu , [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0], 0>,
-                                InstrStage<1, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [4, 3, 1, 1]>,
-  //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<2, [A8_Pipe0], 0>,
-                                InstrStage<2, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>]>,
+  InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 2, 1, 1, 3]>,
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>,
+  //
+  // Push, def is the 3th operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3]>,
 
-  // Integer store pipeline
   //
-  // use A8_Issue to enforce the 1 load/store per cycle limit
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+  // Integer store pipeline
   //
   // Immediate offset
-  InstrItinData<IIC_iStorei  , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 1]>,
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
   //
   // Register offset
-  InstrItinData<IIC_iStorer  , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
   //
   // Scaled register offset, issues over 2 cycles
-  InstrItinData<IIC_iStoresi , [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0], 0>,
-                                InstrStage<1, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iStoreiu , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iStoreru  , [InstrStage<1, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
   //
   // Scaled register offset with update, issues over 2 cycles
-  InstrItinData<IIC_iStoresiu, [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<1, [A8_Pipe0], 0>,
-                                InstrStage<1, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>], [3, 3, 1, 1]>,
-  //
-  // Store multiple
-  InstrItinData<IIC_iStorem  , [InstrStage<2, [A8_Issue], 0>,
-                                InstrStage<2, [A8_Pipe0], 0>,
-                                InstrStage<2, [A8_Pipe1]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>]>,
+  InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>], [2]>,
+
+  //
+  // Preload
+  InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
 
   // Branch
   //
@@ -178,440 +248,786 @@ def CortexA8Itineraries : ProcessorItineraries<
   // possible.
   //
   // FP Special Register to Integer Register File Move
-  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                              InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                              InstrStage<1, [A8_NLSPipe]>], [20]>,
   //
   // Single-precision FP Unary
-  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 1]>,
   //
   // Double-precision FP Unary
-  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<4, [A8_NPipe], 0>,
                                InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
   //
   // Single-precision FP Compare
-  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [1, 1]>,
   //
   // Double-precision FP Compare
-  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<4, [A8_NPipe], 0>,
                                InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
   //
   // Single to Double FP Convert
-  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<7, [A8_NPipe], 0>,
                                InstrStage<7, [A8_NLSPipe]>], [7, 1]>,
   //
   // Double to Single FP Convert
-  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<5, [A8_NPipe], 0>,
                                InstrStage<5, [A8_NLSPipe]>], [5, 1]>,
   //
   // Single-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 1]>,
   //
   // Double-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<8, [A8_NPipe], 0>,
                                InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
   //
   // Integer to Single-Precision FP Convert
-  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 1]>,
   //
   // Integer to Double-Precision FP Convert
-  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<8, [A8_NPipe], 0>,
                                InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
   //
   // Single-precision FP ALU
-  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
   //
   // Double-precision FP ALU
-  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<9, [A8_NPipe], 0>,
                                InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>,
   //
   // Single-precision FP Multiply
-  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
   //
   // Double-precision FP Multiply
-  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<11, [A8_NPipe], 0>,
                                InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>,
   //
   // Single-precision FP MAC
-  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
   //
   // Double-precision FP MAC
-  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<19, [A8_NPipe], 0>,
                                InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
   //
   // Single-precision FP DIV
-  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<20, [A8_NPipe], 0>,
                                InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>,
   //
   // Double-precision FP DIV
-  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<29, [A8_NPipe], 0>,
                                InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>,
   //
   // Single-precision FP SQRT
-  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<19, [A8_NPipe], 0>,
                                InstrStage<19, [A8_NLSPipe]>], [19, 1]>,
   //
   // Double-precision FP SQRT
-  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<29, [A8_NPipe], 0>,
                                InstrStage<29, [A8_NLSPipe]>], [29, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 20, 1]>,
+
   //
   // Single-precision FP Load
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
   //
   // Double-precision FP Load
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
   //
   // FP Load Multiple
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoadm,  [InstrStage<3, [A8_Issue], 0>,
-                               InstrStage<2, [A8_Pipe0], 0>,
-                               InstrStage<2, [A8_Pipe1]>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>,
   //
   // Single-precision FP Store
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
   //
   // Double-precision FP Store
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
   //
   // FP Store Multiple
-  // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>,
-                               InstrStage<2, [A8_Pipe0], 0>,
-                               InstrStage<2, [A8_Pipe1]>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>,
 
   // NEON
   // Issue through integer pipeline, and execute in NEON unit.
   //
   // VLD1
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD1,     [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1]>,
+  //
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1]>,
+  //
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
   //
   // VLD2
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1]>,
   //
   // VLD3
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
   //
   // VLD4
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>,
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
   //
-  // VST
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VST,      [InstrStage<1, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
   //
   // Double-register FP Unary
-  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [5, 2]>,
   //
   // Quad-register FP Unary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
-  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [6, 2]>,
   //
   // Double-register FP Binary
-  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
   //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+  //
   // Quad-register FP Binary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
-  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
   //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
   // Move Immediate
-  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3]>,
   //
   // Double-register Permute Move
-  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
   //
   // Quad-register Permute Move
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 3 for those cases
-  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 1]>,
   //
   // Integer to Single-precision Move
-  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
   //
   // Integer to Double-precision Move
-  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
   //
   // Single-precision to Integer Move
-  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [20, 1]>,
   //
   // Double-precision to Integer Move
-  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>,
   //
   // Integer to Lane Move
-  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
   //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [2, 1]>,
+  //
   // Double-register Permute
-  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>,
   //
   // Quad-register Permute
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 3 for those cases
-  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
   //
   // Quad-register Permute (3 cycle issue)
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>,
                                InstrStage<1, [A8_NPipe], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
   //
   // Double-register FP Multiple-Accumulate
-  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
   //
   // Quad-register FP Multiple-Accumulate
   // Result written in N9, but that is relative to the last cycle of multicycle,
   // so we use 10 for those cases
-  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
   //
   // Double-register Reciprical Step
-  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
   //
   // Quad-register Reciprical Step
-  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
   //
   // Double-register Integer Count
-  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
   //
   // Quad-register Integer Count
   // Result written in N3, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
   //
   // Double-register Integer Unary
-  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2]>,
   //
   // Quad-register Integer Unary
-  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2]>,
   //
   // Double-register Integer Q-Unary
-  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 1]>,
   //
   // Quad-register Integer CountQ-Unary
-  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 1]>,
   //
   // Double-register Integer Binary
-  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
   //
   // Quad-register Integer Binary
-  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
   //
   // Double-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
   //
   // Quad-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
 
   //
   // Double-register Integer Subtract
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
   //
   // Quad-register Integer Subtract
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
   //
   // Double-register Integer Subtract
-  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
   //
   // Quad-register Integer Subtract
-  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
   //
   // Double-register Integer Shift
-  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
   //
   // Quad-register Integer Shift
-  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
   //
   // Double-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
   //
   // Quad-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
   //
   // Double-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
   //
   // Quad-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
   //
   // Double-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
   //
   // Quad-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
 
   //
   // Double-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
   //
   // Double-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
   //
   // Quad-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
   //
   // Quad-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>,
                                InstrStage<2, [A8_NLSPipe], 0>,
                                InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
   //
   // Double-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
   //
   // Double-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
   //
   // Quad-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
   //
   // Quad-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>,
                                InstrStage<2, [A8_NLSPipe], 0>,
                                InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
   //
   // Double-register VEXT
-  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
   //
   // Quad-register VEXT
-  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
   //
   // VTB
-  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
-  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
-  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>,
                                InstrStage<1, [A8_NPipe], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>,
                                InstrStage<1, [A8_NPipe], 0>,
                                InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
   //
   // VTBX
-  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
-  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
-  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>,
                                InstrStage<1, [A8_NPipe], 0>,
                                InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NLSPipe]>,
                                InstrStage<1, [A8_NPipe], 0>,
                             InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
index df2f896..82c6735 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -16,130 +16,417 @@
 // Reference Manual".
 //
 // Functional units
-def A9_Pipe0   : FuncUnit; // pipeline 0
-def A9_Pipe1   : FuncUnit; // pipeline 1
-def A9_LSPipe  : FuncUnit; // LS pipe
-def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipe
+def A9_Issue0  : FuncUnit; // Issue 0
+def A9_Issue1  : FuncUnit; // Issue 1
+def A9_Branch  : FuncUnit; // Branch
+def A9_ALU0    : FuncUnit; // ALU / MUL pipeline 0
+def A9_ALU1    : FuncUnit; // ALU pipeline 1
+def A9_AGU     : FuncUnit; // Address generation unit for ld / st
+def A9_NPipe   : FuncUnit; // NEON pipeline
+def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
+def A9_LSUnit  : FuncUnit; // L/S Unit
 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
 def A9_DRegsN  : FuncUnit; // FP register set, NEON side
 
-// Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
-//
+// Bypasses
+def A9_LdBypass : Bypass;
+
 def CortexA9Itineraries : ProcessorItineraries<
-  [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [
+  [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
+   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
+  [A9_LdBypass], [
   // Two fully-pipelined integer ALU pipelines
-  // FIXME: There are no operand latencies for these instructions at all!
+
   //
   // Move instructions, unconditional
-  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
-  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
-  InstrItinData<IIC_iMOVsr  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_AGU], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                              [2, 1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [3, 1, 1]>,
   //
   // No operand cycles
-  InstrItinData<IIC_iALUx    , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
   //
   // Binary Instructions that produce a result
-  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>,
-  InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>,
-  InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                            [3, 1, 1, 1],
+                            [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
   //
   // Unary Instructions that produce a result
-  InstrItinData<IIC_iUNAr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iUNAsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iUNAsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
   //
   // Compare instructions
-  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
-  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>,
-  InstrItinData<IIC_iCMPsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iCMPsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                                [1, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
   //
   // Move instructions, conditional
-  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
-  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
-  InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
 
   // Integer multiply pipeline
   //
-  InstrItinData<IIC_iMUL16   , [InstrStage<1, [A9_Pipe1], 0>,
-                                InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
-  InstrItinData<IIC_iMAC16   , [InstrStage<1, [A9_Pipe1], 0>,
-                                InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>,
-  InstrItinData<IIC_iMUL32   , [InstrStage<1, [A9_Pipe1], 0>,
-                                InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
-  InstrItinData<IIC_iMAC32   , [InstrStage<1, [A9_Pipe1], 0>,
-                                InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>,
-  InstrItinData<IIC_iMUL64   , [InstrStage<2, [A9_Pipe1], 0>,
-                                InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
-  InstrItinData<IIC_iMAC64   , [InstrStage<2, [A9_Pipe1], 0>,
-                                InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>],
+                              [4, 5, 1, 1]>,
   // Integer load pipeline
   // FIXME: The timings are some rough approximations
   //
   // Immediate offset
-  InstrItinData<IIC_iLoadi   , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1], [A9_LdBypass]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1], [A9_LdBypass]>,
   //
   // Register offset
-  InstrItinData<IIC_iLoadr   , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset
-  InstrItinData<IIC_iLoadsi  , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit], 0>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [5, 1, 1], [A9_LdBypass]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iLoadiu  , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1], [A9_LdBypass]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iLoadru  , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
   //
   // Scaled register offset with update
-  InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [5, 4, 1, 1], [A9_LdBypass]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [2, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 2, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass]>,
+
   //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>]>,
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<1, [A9_LSUnit]>,
+                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [2, 1]>,
 
   // Integer store pipeline
   ///
   // Immediate offset
-  InstrItinData<IIC_iStorei  , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
   //
   // Register offset
-  InstrItinData<IIC_iStorer  , [InstrStage<1, [ A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   //
   // Scaled register offset
-  InstrItinData<IIC_iStoresi , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iStoreiu , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iStoreru , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
   //
   // Scaled register offset with update
-  InstrItinData<IIC_iStoresiu, [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [3, 1, 1, 1]>,
   //
   // Store multiple
-  InstrItinData<IIC_iStorem  , [InstrStage<1, [A9_Pipe1]>,
-                                InstrStage<1, [A9_LSPipe]>]>,
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>], [2]>,
+
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
   // Branch
   //
   // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
+  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
+                                InstrStage<1, [A9_Issue1], 0>,
+                                InstrStage<1, [A9_Branch]>]>,
 
   // VFP and NEON shares the same register file. This means that every VFP
   // instruction should wait for full completion of the consecutive NEON
@@ -159,687 +446,1379 @@ def CortexA9Itineraries : ProcessorItineraries<
   // Issue through integer pipeline, and execute in NEON unit.
 
   // FP Special Register to Integer Register File Move
-  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                              InstrStage<1, [A9_Pipe1]>,
-                              InstrStage<1, [A9_NPipe]>]>,
+                              InstrStage<1, [A9_NPipe]>],
+                             [1]>,
   //
   // Single-precision FP Unary
-  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
   //
   // Double-precision FP Unary
-  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
 
   //
   // Single-precision FP Compare
-  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 4 cycles
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
   //
   // Double-precision FP Compare
-  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra latency cycles since wbck is 4 cycles
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
   //
   // Single to Double FP Convert
-  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Double to Single FP Convert
-  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
 
   //
   // Single to Half FP Convert
-  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Half to Single FP Convert
-  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
 
   //
   // Single-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Double-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Integer to Single-Precision FP Convert
-  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Integer to Double-Precision FP Convert
-  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Single-precision FP ALU
-  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
   //
   // Double-precision FP ALU
-  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
   //
   // Single-precision FP Multiply
-  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<6, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [5, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
   //
   // Double-precision FP Multiply
-  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<7, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 1, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 1, 1]>,
   //
   // Single-precision FP MAC
-  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<9, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
   //
   // Double-precision FP MAC
-  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
                                InstrStage<10, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Pipe1]>,
-                               InstrStage<2,  [A9_NPipe]>], [9, 0, 1, 1]>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
   //
   // Single-precision FP DIV
-  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
                                InstrStage<16, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Pipe1]>,
-                               InstrStage<10, [A9_NPipe]>], [15, 1, 1]>,
+                               InstrStage<10, [A9_NPipe]>],
+                              [15, 1, 1]>,
   //
   // Double-precision FP DIV
-  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
                                InstrStage<26, [A9_DRegsN],  0, Reserved>,
-                               InstrStage<1,  [A9_Pipe1]>,
-                               InstrStage<20, [A9_NPipe]>], [25, 1, 1]>,
+                               InstrStage<20, [A9_NPipe]>],
+                              [25, 1, 1]>,
   //
   // Single-precision FP SQRT
-  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
                                InstrStage<18, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1,  [A9_Pipe1]>,
-                               InstrStage<13, [A9_NPipe]>], [17, 1]>,
+                               InstrStage<13, [A9_NPipe]>],
+                              [17, 1]>,
   //
   // Double-precision FP SQRT
-  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
                                InstrStage<33, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1,  [A9_Pipe1]>,
-                               InstrStage<28, [A9_NPipe]>], [32, 1]>,
+                               InstrStage<28, [A9_NPipe]>],
+                              [32, 1]>,
 
   //
   // Integer to Single-precision Move
-  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra 1 latency cycle since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
   //
   // Integer to Double-precision Move
-  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                // Extra 1 latency cycle since wbck is 2 cycles
                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
   //
   // Single-precision to Integer Move
-  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
   //
   // Double-precision to Integer Move
-  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1, 1]>,
   //
   // Single-precision FP Load
-  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
   //
   // Double-precision FP Load
-  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  // FIXME: Result latency is 1 if address is 64-bit aligned.
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>,
   //
   // Single-precision FP Store
-  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
   //
   // Double-precision FP Store
-  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_NPipe], 0>,
+                                InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>,
   // NEON
-  // Issue through integer pipeline, and execute in NEON unit.
-  // FIXME: Neon pipeline and LdSt unit are multiplexed.
-  //        Add some syntactic sugar to model this!
   // VLD1
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+  // FIXME: Conservatively assume insufficent alignment.
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 2, 3, 1]>,
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 2, 1]>,
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 2, 3, 2, 1]>,
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [4, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [4, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1]>,
   //
   // VLD2
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 4, 3, 4, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [4, 4, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 4, 3, 4, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [4, 4, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1]>,
   //
   // VLD3
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
   //
   // VLD4
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>,
-  //
-  // VST
-  // FIXME: We don't model this instruction properly
-  InstrItinData<IIC_VST,      [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 6 cycles
-                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1], 0>,
-                               InstrStage<1, [A9_LSPipe]>,
-                               InstrStage<1, [A9_NPipe]>]>,
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+
   //
   // Double-register Integer Unary
-  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
   //
   // Quad-register Integer Unary
-  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
   //
   // Double-register Integer Q-Unary
-  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Quad-register Integer CountQ-Unary
-  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
   //
   // Double-register Integer Binary
-  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
   //
   // Quad-register Integer Binary
-  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
   //
   // Double-register Integer Subtract
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
   //
   // Quad-register Integer Subtract
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
   //
   // Double-register Integer Shift
-  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
   //
   // Quad-register Integer Shift
-  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
   //
   // Double-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
   //
   // Quad-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
   //
   // Double-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
   //
   // Quad-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
   //
   // Double-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
   //
   // Quad-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
 
   //
   // Double-register Integer Count
-  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
   //
   // Quad-register Integer Count
   // Result written in N3, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [4, 2, 2]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 2, 2]>,
   //
   // Double-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
   //
   // Quad-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
   //
   // Double-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [6, 3, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 1]>,
   //
   // Quad-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 3, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 1]>,
 
   //
   // Double-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [6, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 2]>,
   //
   // Quad-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [7, 2, 2]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 2]>,
 
   //
   // Double-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [7, 2, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 1]>,
   //
   // Quad-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<4, [A9_NPipe]>], [9, 2, 1]>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 2, 1]>,
   //
   // Double-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 2]>,
   //
   // Double-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 1]>,
   //
   // Quad-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 2]>,
   //
   // Quad-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 3, 2, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1,1]>,
   //
   // Move Immediate
-  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [3]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3]>,
   //
   // Double-register Permute Move
-  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_LSPipe]>], [2, 1]>,
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
   //
   // Quad-register Permute Move
-  // Result written in N2, but that is relative to the last cycle of multicycle,
-  // so we use 3 for those cases
-  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
-                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 1]>,
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
   //
   // Integer to Single-precision Move
-  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
   //
   // Integer to Double-precision Move
-  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
   //
   // Single-precision to Integer Move
-  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
   //
   // Double-precision to Integer Move
-  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1]>,
   //
   // Integer to Lane Move
-  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN],   0, Required>,
-  // FIXME: all latencies are arbitrary, no information is available
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 1]>,
 
   //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1]>,
+  //
   // Double-register FP Unary
-  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [5, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2]>,
   //
   // Quad-register FP Unary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
-  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 2]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2]>,
   //
   // Double-register FP Binary
   // FIXME: We're using this itin for many instructions and [2, 2] here is too
   // optimistic.
-  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [5, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 1]>,
   //
   // Quad-register FP Binary
   // Result written in N5, but that is relative to the last cycle of multicycle,
   // so we use 6 for those cases
   // FIXME: We're using this itin for many instructions and [2, 2] here is too
   // optimistic.
-  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 8 cycles
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 1]>,
   //
   // Double-register FP Multiple-Accumulate
-  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
   //
   // Quad-register FP Multiple-Accumulate
   // Result written in N9, but that is relative to the last cycle of multicycle,
   // so we use 10 for those cases
-  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 9 cycles
                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
   //
   // Double-register Reciprical Step
-  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 10 cycles
+                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [9, 2, 2]>,
   //
   // Quad-register Reciprical Step
-  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 9 cycles
-                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<4, [A9_NPipe]>], [8, 2, 2]>,
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 11 cycles
+                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [10, 2, 2]>,
   //
   // Double-register Permute
-  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1, 1]>,
   //
   // Quad-register Permute
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 3 for those cases
-  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 3, 1, 1]>,
   //
   // Quad-register Permute (3 cycle issue)
   // Result written in N2, but that is relative to the last cycle of multicycle,
   // so we use 4 for those cases
-  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 4, 1, 1]>,
 
   //
   // Double-register VEXT
-  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 7 cycles
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1, 1]>,
   //
   // Quad-register VEXT
-  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
-                               // Extra latency cycles since wbck is 9 cycles
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2]>,
   //
   // VTB
-  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 2, 1]>,
-  InstrItinData<IIC_VTB2,     [InstrStage<2, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>,
-  InstrItinData<IIC_VTB3,     [InstrStage<2, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 3, 1]>,
   //
   // VTBX
-  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>,
-  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>,
-  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                               InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>,
-  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 8 cycles
                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<1, [A9_Pipe1]>,
-                              InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 3, 1]>
 ]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
index 08b560c..c1880a7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
@@ -19,7 +19,7 @@ def V6_Pipe : FuncUnit; // pipeline
 // Scheduling information derived from "ARM1176JZF-S Technical Reference Manual"
 //
 def ARMV6Itineraries : ProcessorItineraries<
-  [V6_Pipe], [
+  [V6_Pipe], [], [
   //
   // No operand cycles
   InstrItinData<IIC_iALUx    , [InstrStage<1, [V6_Pipe]>]>,
@@ -30,10 +30,20 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iALUsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
   InstrItinData<IIC_iALUsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
   //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iBITr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
   // Unary Instructions that produce a result
   InstrItinData<IIC_iUNAr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
   InstrItinData<IIC_iUNAsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
-  InstrItinData<IIC_iUNAsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr    , [InstrStage<1, [V6_Pipe]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr  , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
   //
   // Compare instructions
   InstrItinData<IIC_iCMPi    , [InstrStage<1, [V6_Pipe]>], [2]>,
@@ -41,17 +51,39 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iCMPsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
   InstrItinData<IIC_iCMPsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
   //
+  // Test instructions
+  InstrItinData<IIC_iTSTi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iTSTr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
   // Move instructions, unconditional
   InstrItinData<IIC_iMOVi    , [InstrStage<1, [V6_Pipe]>], [2]>,
   InstrItinData<IIC_iMOVr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
   InstrItinData<IIC_iMOVsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
   InstrItinData<IIC_iMOVsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iMOVix2  , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>], [5]>,
   //
   // Move instructions, conditional
   InstrItinData<IIC_iCMOVi   , [InstrStage<1, [V6_Pipe]>], [3]>,
   InstrItinData<IIC_iCMOVr   , [InstrStage<1, [V6_Pipe]>], [3, 2]>,
   InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [V6_Pipe]>], [3, 1]>,
   InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [4]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMVNr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMVNsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMVNsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
 
   // Integer multiply pipeline
   //
@@ -65,50 +97,90 @@ def ARMV6Itineraries : ProcessorItineraries<
   // Integer load pipeline
   //
   // Immediate offset
-  InstrItinData<IIC_iLoadi   , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
   //
   // Register offset
-  InstrItinData<IIC_iLoadr   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
   //
   // Scaled register offset, issues over 2 cycles
-  InstrItinData<IIC_iLoadsi  , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  InstrItinData<IIC_iLoad_si   , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iLoadiu  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iLoad_iu   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iLoadru  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_ru   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
   //
   // Scaled register offset with update, issues over 2 cycles
-  InstrItinData<IIC_iLoadsiu , [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+  InstrItinData<IIC_iLoad_siu,   [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+
+  //
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [3, 1]>,
 
   //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<3, [V6_Pipe]>]>,
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop     , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
 
   // Integer store pipeline
   //
   // Immediate offset
-  InstrItinData<IIC_iStorei  , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_i   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
   //
   // Register offset
-  InstrItinData<IIC_iStorer  , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
-
+  InstrItinData<IIC_iStore_r   , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
   //
   // Scaled register offset, issues over 2 cycles
-  InstrItinData<IIC_iStoresi , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_si   , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
   //
   // Immediate offset with update
-  InstrItinData<IIC_iStoreiu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_iu   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
   //
   // Register offset with update
-  InstrItinData<IIC_iStoreru , [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_ru,   [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
   //
   // Scaled register offset with update, issues over 2 cycles
-  InstrItinData<IIC_iStoresiu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_iStore_siu,   [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
   //
   // Store multiple
-  InstrItinData<IIC_iStorem   , [InstrStage<3, [V6_Pipe]>]>,
+  InstrItinData<IIC_iStore_m  , [InstrStage<3, [V6_Pipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
   
   // Branch
   //
@@ -183,6 +255,18 @@ def ARMV6Itineraries : ProcessorItineraries<
   // Double-precision FP SQRT
   InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
   //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>,
+  //
   // Single-precision FP Load
   InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
   //
@@ -190,7 +274,10 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm , [InstrStage<3, [V6_Pipe]>]>,
+  InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
@@ -200,5 +287,8 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem , [InstrStage<3, [V6_Pipe]>]>
+  InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
 ]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a289407..2b9202b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -29,10 +29,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                              SDValue Dst, SDValue Src,
                                              SDValue Size, unsigned Align,
                                              bool isVolatile, bool AlwaysInline,
-                                             const Value *DstSV,
-                                             uint64_t DstSVOff,
-                                             const Value *SrcSV,
-                                             uint64_t SrcSVOff) const {
+                                             MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
   // Do repeated 4-byte loads and stores. To be improved.
   // This requires 4-byte alignment.
   if ((Align & 3) != 0)
@@ -66,7 +64,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
       Loads[i] = DAG.getLoad(VT, dl, Chain,
                              DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                          DAG.getConstant(SrcOff, MVT::i32)),
-                             SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0);
+                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+                             false, 0);
       TFOps[i] = Loads[i].getValue(1);
       SrcOff += VTSize;
     }
@@ -77,7 +76,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
       TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                               DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                           DAG.getConstant(DstOff, MVT::i32)),
-                              DstSV, DstSVOff + DstOff, isVolatile, false, 0);
+                              DstPtrInfo.getWithOffset(DstOff),
+                              isVolatile, false, 0);
       DstOff += VTSize;
     }
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
@@ -103,7 +103,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
     Loads[i] = DAG.getLoad(VT, dl, Chain,
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                        DAG.getConstant(SrcOff, MVT::i32)),
-                           SrcSV, SrcSVOff + SrcOff, false, false, 0);
+                           SrcPtrInfo.getWithOffset(SrcOff), false, false, 0);
     TFOps[i] = Loads[i].getValue(1);
     ++i;
     SrcOff += VTSize;
@@ -125,7 +125,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                         DAG.getConstant(DstOff, MVT::i32)),
-                            DstSV, DstSVOff + DstOff, false, false, 0);
+                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
     ++i;
     DstOff += VTSize;
     BytesLeft -= VTSize;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index d7d00c2..7533690 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -33,10 +33,8 @@ public:
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
                                   bool isVolatile, bool AlwaysInline,
-                                  const Value *DstSV,
-                                  uint64_t DstSVOff,
-                                  const Value *SrcSV,
-                                  uint64_t SrcSVOff) const;
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const;
 };
 
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
index cb539f4..0bd740c 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,6 +13,7 @@
 
 #include "ARMSubtarget.h"
 #include "ARMGenSubtarget.inc"
+#include "ARMBaseRegisterInfo.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/CommandLine.h"
@@ -24,45 +25,52 @@ ReserveR9("arm-reserve-r9", cl::Hidden,
           cl::desc("Reserve R9, making it unavailable as GPR"));
 
 static cl::opt<bool>
-UseMOVT("arm-use-movt",
-        cl::init(true), cl::Hidden);
+DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+StrictAlign("arm-strict-align", cl::Hidden,
+            cl::desc("Disallow all unaligned memory accesses"));
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
                            bool isT)
   : ARMArchVersion(V4)
+  , ARMProcFamily(Others)
   , ARMFPUType(None)
   , UseNEONForSinglePrecisionFP(false)
-  , SlowVMLx(false)
+  , SlowFPVMLx(false)
   , SlowFPBrcc(false)
   , IsThumb(isT)
   , ThumbMode(Thumb1)
   , NoARM(false)
   , PostRAScheduler(false)
   , IsR9Reserved(ReserveR9)
-  , UseMovt(UseMOVT)
+  , UseMovt(false)
   , HasFP16(false)
+  , HasD16(false)
   , HasHardwareDivide(false)
   , HasT2ExtractPack(false)
   , HasDataBarrier(false)
   , Pref32BitThumb(false)
+  , HasMPExtension(false)
   , FPOnlySP(false)
+  , AllowsUnalignedMem(false)
   , stackAlignment(4)
   , CPUString("generic")
-  , TargetType(isELF) // Default to ELF unless otherwise specified.
+  , TargetTriple(TT)
   , TargetABI(ARM_ABI_APCS) {
-  // default to soft float ABI
+  // Default to soft float ABI
   if (FloatABIType == FloatABI::Default)
     FloatABIType = FloatABI::Soft;
 
   // Determine default and user specified characteristics
 
-  // Parse features string.
-  CPUString = ParseSubtargetFeatures(FS, CPUString);
-
   // When no arch is specified either by CPU or by attributes, make the default
   // ARMv4T.
-  if (CPUString == "generic" && (FS.empty() || FS == "generic"))
+  const char *ARMArchFeature = "";
+  if (CPUString == "generic" && (FS.empty() || FS == "generic")) {
     ARMArchVersion = V4T;
+    ARMArchFeature = ",+v4t";
+  }
 
   // Set the boolean corresponding to the current target triple, or the default
   // if one cannot be determined, to true.
@@ -80,47 +88,78 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
     unsigned SubVer = TT[Idx];
     if (SubVer >= '7' && SubVer <= '9') {
       ARMArchVersion = V7A;
-      if (Len >= Idx+2 && TT[Idx+1] == 'm')
+      ARMArchFeature = ",+v7a";
+      if (Len >= Idx+2 && TT[Idx+1] == 'm') {
         ARMArchVersion = V7M;
+        ARMArchFeature = ",+v7m";
+      }
     } else if (SubVer == '6') {
       ARMArchVersion = V6;
-      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
+      ARMArchFeature = ",+v6";
+      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') {
         ARMArchVersion = V6T2;
+        ARMArchFeature = ",+v6t2";
+      }
     } else if (SubVer == '5') {
       ARMArchVersion = V5T;
-      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
+      ARMArchFeature = ",+v5t";
+      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') {
         ARMArchVersion = V5TE;
+        ARMArchFeature = ",+v5te";
+      }
     } else if (SubVer == '4') {
-      if (Len >= Idx+2 && TT[Idx+1] == 't')
+      if (Len >= Idx+2 && TT[Idx+1] == 't') {
         ARMArchVersion = V4T;
-      else
+        ARMArchFeature = ",+v4t";
+      } else {
         ARMArchVersion = V4;
+        ARMArchFeature = "";
+      }
     }
   }
 
+  if (TT.find("eabi") != std::string::npos)
+    TargetABI = ARM_ABI_AAPCS;
+
+  // Parse features string.  If the first entry in FS (the CPU) is missing,
+  // insert the architecture feature derived from the target triple.  This is
+  // important for setting features that are implied based on the architecture
+  // version.
+  std::string FSWithArch;
+  if (FS.empty())
+    FSWithArch = std::string(ARMArchFeature);
+  else if (FS.find(',') == 0)
+    FSWithArch = std::string(ARMArchFeature) + FS;
+  else
+    FSWithArch = FS;
+  CPUString = ParseSubtargetFeatures(FSWithArch, CPUString);
+
+  // After parsing Itineraries, set ItinData.IssueWidth.
+  computeIssueWidth();
+
   // Thumb2 implies at least V6T2.
   if (ARMArchVersion >= V6T2)
     ThumbMode = Thumb2;
   else if (ThumbMode >= Thumb2)
     ARMArchVersion = V6T2;
 
-  if (Len >= 10) {
-    if (TT.find("-darwin") != std::string::npos)
-      // arm-darwin
-      TargetType = isDarwin;
-  }
-
-  if (TT.find("eabi") != std::string::npos)
-    TargetABI = ARM_ABI_AAPCS;
-
   if (isAAPCS_ABI())
     stackAlignment = 8;
 
-  if (isTargetDarwin())
+  if (!isTargetDarwin())
+    UseMovt = hasV6T2Ops();
+  else {
     IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
+    UseMovt = DarwinUseMOVT && hasV6T2Ops();
+  }
 
   if (!isThumb() || hasThumb2())
     PostRAScheduler = true;
+
+  // v6+ may or may not support unaligned mem access depending on the system
+  // configuration.
+  if (!StrictAlign && hasV6Ops() && isTargetDarwin())
+    AllowsUnalignedMem = true;
 }
 
 /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
@@ -163,7 +202,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
       // through a stub.
       if (!isDecl && !GV->isWeakForLinker())
         return false;
-    
+
       // Unless we have a symbol with hidden visibility, we have to go through a
       // normal $non_lazy_ptr stub because this symbol might be resolved late.
       if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
@@ -174,6 +213,34 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
   return false;
 }
 
+unsigned ARMSubtarget::getMispredictionPenalty() const {
+  // If we have a reasonable estimate of the pipeline depth, then we can
+  // estimate the penalty of a misprediction based on that.
+  if (isCortexA8())
+    return 13;
+  else if (isCortexA9())
+    return 8;
+
+  // Otherwise, just return a sensible default.
+  return 10;
+}
+
+void ARMSubtarget::computeIssueWidth() {
+  unsigned allStage1Units = 0;
+  for (const InstrItinerary *itin = InstrItins.Itineraries;
+       itin->FirstStage != ~0U; ++itin) {
+    const InstrStage *IS = InstrItins.Stages + itin->FirstStage;
+    allStage1Units |= IS->getUnits();
+  }
+  InstrItins.IssueWidth = 0;
+  while (allStage1Units) {
+    ++InstrItins.IssueWidth;
+    // clear the lowest bit
+    allStage1Units ^= allStage1Units & ~(allStage1Units - 1);
+  }
+  assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units");
+}
+
 bool ARMSubtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtarget::AntiDepBreakMode& Mode,
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
index 67e5803..76c1c3f 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -17,7 +17,7 @@
 #include "llvm/Target/TargetInstrItineraries.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtarget.h"
-#include "ARMBaseRegisterInfo.h"
+#include "llvm/ADT/Triple.h"
 #include <string>
 
 namespace llvm {
@@ -29,6 +29,10 @@ protected:
     V4, V4T, V5T, V5TE, V6, V6M, V6T2, V7A, V7M
   };
 
+  enum ARMProcFamilyEnum {
+    Others, CortexA8, CortexA9
+  };
+
   enum ARMFPEnum {
     None, VFPv2, VFPv3, NEON
   };
@@ -42,6 +46,9 @@ protected:
   /// V6, V6T2, V7A, V7M.
   ARMArchEnum ARMArchVersion;
 
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
   /// ARMFPUType - Floating Point Unit type.
   ARMFPEnum ARMFPUType;
 
@@ -50,9 +57,9 @@ protected:
   /// determine if NEON should actually be used.
   bool UseNEONForSinglePrecisionFP;
 
-  /// SlowVMLx - If the VFP2 instructions are available, indicates whether
-  /// the VML[AS] instructions are slow (if so, don't use them).
-  bool SlowVMLx;
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVMLx;
 
   /// SlowFPBrcc - True if floating point compare + branch is slow.
   bool SlowFPBrcc;
@@ -80,6 +87,10 @@ protected:
   /// only so far)
   bool HasFP16;
 
+  /// HasD16 - True if subtarget is limited to 16 double precision
+  /// FP registers for VFPv3.
+  bool HasD16;
+
   /// HasHardwareDivide - True if subtarget supports [su]div
   bool HasHardwareDivide;
 
@@ -95,10 +106,19 @@ protected:
   /// over 16-bit ones.
   bool Pref32BitThumb;
 
+  /// HasMPExtension - True if the subtarget supports Multiprocessing
+  /// extension (ARMv7 only).
+  bool HasMPExtension;
+
   /// FPOnlySP - If true, the floating point unit only supports single
   /// precision.
   bool FPOnlySP;
 
+  /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
+  /// accesses for some types.  For details, see
+  /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
+  bool AllowsUnalignedMem;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -106,6 +126,9 @@ protected:
   /// CPUString - String name of used CPU.
   std::string CPUString;
 
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
   /// Selected instruction itineraries (one entry per itinerary class.)
   InstrItineraryData InstrItins;
 
@@ -136,6 +159,8 @@ protected:
   std::string ParseSubtargetFeatures(const std::string &FS,
                                      const std::string &CPU);
 
+  void computeIssueWidth();
+
   bool hasV4TOps()  const { return ARMArchVersion >= V4T;  }
   bool hasV5TOps()  const { return ARMArchVersion >= V5T;  }
   bool hasV5TEOps() const { return ARMArchVersion >= V5TE; }
@@ -143,6 +168,9 @@ protected:
   bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; }
   bool hasV7Ops()   const { return ARMArchVersion >= V7A;  }
 
+  bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+  bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+
   bool hasARMOps() const { return !NoARM; }
 
   bool hasVFP2() const { return ARMFPUType >= VFPv2; }
@@ -153,15 +181,17 @@ protected:
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool hasDataBarrier() const { return HasDataBarrier; }
-  bool useVMLx() const {return hasVFP2() && !SlowVMLx; }
+  bool useFPVMLx() const { return !SlowFPVMLx; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool isFPOnlySP() const { return FPOnlySP; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
+  bool hasMPExtension() const { return HasMPExtension; }
 
   bool hasFP16() const { return HasFP16; }
+  bool hasD16() const { return HasD16; }
 
-  bool isTargetDarwin() const { return TargetType == isDarwin; }
-  bool isTargetELF() const { return TargetType == isELF; }
+  bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; }
+  bool isTargetELF() const { return !isTargetDarwin(); }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
   bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
@@ -175,8 +205,12 @@ protected:
 
   bool useMovt() const { return UseMovt && hasV6T2Ops(); }
 
+  bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+
   const std::string & getCPUString() const { return CPUString; }
 
+  unsigned getMispredictionPenalty() const;
+
   /// enablePostRAScheduler - True at 'More' optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtarget::AntiDepBreakMode& Mode,
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 30ff827..0ee773b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -12,15 +12,18 @@
 
 #include "ARMTargetMachine.h"
 #include "ARMMCAsmInfo.h"
-#include "ARMFrameInfo.h"
+#include "ARMFrameLowering.h"
 #include "ARM.h"
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
 using namespace llvm;
 
+static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
+
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
   switch (TheTriple.getOS()) {
@@ -31,6 +34,26 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   }
 }
 
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
+                                    MCContext &Ctx, TargetAsmBackend &TAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  switch (Triple(TT).getOS()) {
+  case Triple::Darwin:
+    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
+  case Triple::MinGW32:
+  case Triple::Cygwin:
+  case Triple::Win32:
+    llvm_unreachable("ARM does not support Windows COFF format");
+    return NULL;
+  default:
+    return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+  }
+}
+
 extern "C" void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
@@ -39,6 +62,19 @@ extern "C" void LLVMInitializeARMTarget() {
   // Register the target asm info.
   RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo);
   RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
+  TargetRegistry::RegisterCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterAsmBackend(TheARMTarget, createARMAsmBackend);
+  TargetRegistry::RegisterAsmBackend(TheThumbTarget, createARMAsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterObjectStreamer(TheARMTarget, createMCStreamer);
+  TargetRegistry::RegisterObjectStreamer(TheThumbTarget, createMCStreamer);
+
 }
 
 /// TargetMachine ctor - Create an ARM architecture model.
@@ -49,9 +85,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
                                            bool isThumb)
   : LLVMTargetMachine(T, TT),
     Subtarget(TT, FS, isThumb),
-    FrameInfo(Subtarget),
     JITInfo(),
-    InstrItins(Subtarget.getInstrItineraryData()) {
+    InstrItins(Subtarget.getInstrItineraryData())
+{
   DefRelocModel = getRelocationModel();
 }
 
@@ -59,12 +95,14 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
                                    const std::string &FS)
   : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget),
     DataLayout(Subtarget.isAPCS_ABI() ?
-               std::string("e-p:32:32-f64:32:32-i64:32:32-"
+               std::string("e-p:32:32-f64:32:64-i64:32:64-"
                            "v128:32:128-v64:32:64-n32") :
                std::string("e-p:32:32-f64:64:64-i64:64:64-"
                            "v128:64:128-v64:64:64-n32")),
+    ELFWriterInfo(*this),
     TLInfo(*this),
-    TSInfo(*this) {
+    TSInfo(*this),
+    FrameLowering(Subtarget) {
   if (!Subtarget.hasARMOps())
     report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
                        "support ARM mode execution!");
@@ -77,14 +115,18 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
               ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
               : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
     DataLayout(Subtarget.isAPCS_ABI() ?
-               std::string("e-p:32:32-f64:32:32-i64:32:32-"
+               std::string("e-p:32:32-f64:32:64-i64:32:64-"
                            "i16:16:32-i8:8:32-i1:8:32-"
                            "v128:32:128-v64:32:64-a:0:32-n32") :
                std::string("e-p:32:32-f64:64:64-i64:64:64-"
                            "i16:16:32-i8:8:32-i1:8:32-"
                            "v128:64:128-v64:64:64-a:0:32-n32")),
+    ELFWriterInfo(*this),
     TLInfo(*this),
-    TSInfo(*this) {
+    TSInfo(*this),
+    FrameLowering(Subtarget.hasThumb2()
+              ? new ARMFrameLowering(Subtarget)
+              : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
 }
 
 // Pass Pipeline Configuration
@@ -104,12 +146,12 @@ bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
 
 bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
                                           CodeGenOpt::Level OptLevel) {
-  if (Subtarget.hasNEON())
-    PM.add(createNEONPreAllocPass());
-
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
     PM.add(createARMLoadStoreOptimizationPass(true));
+  if (ExpandMLx &&
+      OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
+    PM.add(createMLxExpansionPass());
 
   return true;
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
index 17e5425..e0aa149 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -14,16 +14,19 @@
 #ifndef ARMTARGETMACHINE_H
 #define ARMTARGETMACHINE_H
 
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
 #include "ARMInstrInfo.h"
-#include "ARMFrameInfo.h"
+#include "ARMELFWriterInfo.h"
+#include "ARMFrameLowering.h"
 #include "ARMJITInfo.h"
 #include "ARMSubtarget.h"
 #include "ARMISelLowering.h"
 #include "ARMSelectionDAGInfo.h"
 #include "Thumb1InstrInfo.h"
+#include "Thumb1FrameLowering.h"
 #include "Thumb2InstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/ADT/OwningPtr.h"
 
 namespace llvm {
@@ -31,9 +34,7 @@ namespace llvm {
 class ARMBaseTargetMachine : public LLVMTargetMachine {
 protected:
   ARMSubtarget        Subtarget;
-
 private:
-  ARMFrameInfo        FrameInfo;
   ARMJITInfo          JITInfo;
   InstrItineraryData  InstrItins;
   Reloc::Model        DefRelocModel;    // Reloc model before it's overridden.
@@ -42,11 +43,10 @@ public:
   ARMBaseTargetMachine(const Target &T, const std::string &TT,
                        const std::string &FS, bool isThumb);
 
-  virtual const ARMFrameInfo     *getFrameInfo() const { return &FrameInfo; }
   virtual       ARMJITInfo       *getJITInfo()         { return &JITInfo; }
   virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
-  virtual const InstrItineraryData getInstrItineraryData() const {
-    return InstrItins;
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
   }
 
   // Pass Pipeline Configuration
@@ -64,9 +64,11 @@ public:
 class ARMTargetMachine : public ARMBaseTargetMachine {
   ARMInstrInfo        InstrInfo;
   const TargetData    DataLayout;       // Calculates type size & alignment
+  ARMELFWriterInfo    ELFWriterInfo;
   ARMTargetLowering   TLInfo;
   ARMSelectionDAGInfo TSInfo;
-public:
+  ARMFrameLowering    FrameLowering;
+ public:
   ARMTargetMachine(const Target &T, const std::string &TT,
                    const std::string &FS);
 
@@ -81,9 +83,15 @@ public:
   virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const {
     return &TSInfo;
   }
+  virtual const ARMFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
 
   virtual const ARMInstrInfo     *getInstrInfo() const { return &InstrInfo; }
   virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const ARMELFWriterInfo *getELFWriterInfo() const {
+    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+  }
 };
 
 /// ThumbTargetMachine - Thumb target machine.
@@ -94,8 +102,11 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
   // Either Thumb1InstrInfo or Thumb2InstrInfo.
   OwningPtr<ARMBaseInstrInfo> InstrInfo;
   const TargetData    DataLayout;   // Calculates type size & alignment
+  ARMELFWriterInfo    ELFWriterInfo;
   ARMTargetLowering   TLInfo;
   ARMSelectionDAGInfo TSInfo;
+  // Either Thumb1FrameLowering or ARMFrameLowering.
+  OwningPtr<ARMFrameLowering> FrameLowering;
 public:
   ThumbTargetMachine(const Target &T, const std::string &TT,
                      const std::string &FS);
@@ -117,7 +128,14 @@ public:
   virtual const ARMBaseInstrInfo *getInstrInfo() const {
     return InstrInfo.get();
   }
+  /// returns either Thumb1FrameLowering or ARMFrameLowering
+  virtual const ARMFrameLowering *getFrameLowering() const {
+    return FrameLowering.get();
+  }
   virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const ARMELFWriterInfo *getELFWriterInfo() const {
+    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+  }
 };
 
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 091a3b3..7535da5 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -12,6 +12,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 using namespace dwarf;
@@ -26,14 +27,20 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
 
   if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) {
     StaticCtorSection =
-      getContext().getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY,
-                                 MCSectionELF::SHF_WRITE |
-                                 MCSectionELF::SHF_ALLOC,
+      getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
+                                 ELF::SHF_WRITE |
+                                 ELF::SHF_ALLOC,
                                  SectionKind::getDataRel());
     StaticDtorSection =
-      getContext().getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY,
-                                 MCSectionELF::SHF_WRITE |
-                                 MCSectionELF::SHF_ALLOC,
+      getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
+                                 ELF::SHF_WRITE |
+                                 ELF::SHF_ALLOC,
                                  SectionKind::getDataRel());
   }
+  
+  AttributesSection =
+    getContext().getELFSection(".ARM.attributes",
+                               ELF::SHT_ARM_ATTRIBUTES,
+                               0,
+                               SectionKind::getMetadata());
 }
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
index 097fc2c..c6a7261 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -18,10 +18,19 @@ class MCContext;
 class TargetMachine;
 
 class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+protected:
+  const MCSection *AttributesSection;
 public:
-  ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {}
+  ARMElfTargetObjectFile() :
+    TargetLoweringObjectFileELF(),
+    AttributesSection(NULL)
+  {}
 
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+  virtual const MCSection *getAttributesSection() const {
+    return AttributesSection;
+  }
 };
 
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
index f859d1b..2428ce1 100644
--- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
@@ -10,10 +10,6 @@
 #include "ARM.h"
 #include "ARMTargetMachine.h"
 
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -22,119 +18,135 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegistry.h"
 
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+
 #include <string>
 #include <map>
 
 using namespace llvm;
 
 namespace {
-  
-  class ARMBaseAsmLexer : public TargetAsmLexer {
-    const MCAsmInfo &AsmInfo;
-    
-    const AsmToken &lexDefinite() {
-      return getLexer()->Lex();
-    }
-    
-    AsmToken LexTokenUAL();
-  protected:
-    typedef std::map <std::string, unsigned> rmap_ty;
-    
-    rmap_ty RegisterMap;
-    
-    void InitRegisterMap(const TargetRegisterInfo *info) {
-      unsigned numRegs = info->getNumRegs();
-
-      for (unsigned i = 0; i < numRegs; ++i) {
-        const char *regName = info->getName(i);
-        if (regName)
-          RegisterMap[regName] = i;
-      }
-    }
-    
-    unsigned MatchRegisterName(StringRef Name) {
-      rmap_ty::iterator iter = RegisterMap.find(Name.str());
-      if (iter != RegisterMap.end())
-        return iter->second;
-      else
-        return 0;
-    }
-    
-    AsmToken LexToken() {
-      if (!Lexer) {
-        SetError(SMLoc(), "No MCAsmLexer installed");
-        return AsmToken(AsmToken::Error, "", 0);
-      }
-      
-      switch (AsmInfo.getAssemblerDialect()) {
-      default:
-        SetError(SMLoc(), "Unhandled dialect");
-        return AsmToken(AsmToken::Error, "", 0);
-      case 0:
-        return LexTokenUAL();
-      }
-    }
-  public:
-    ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
-      : TargetAsmLexer(T), AsmInfo(MAI) {
+
+class ARMBaseAsmLexer : public TargetAsmLexer {
+  const MCAsmInfo &AsmInfo;
+
+  const AsmToken &lexDefinite() {
+    return getLexer()->Lex();
+  }
+
+  AsmToken LexTokenUAL();
+protected:
+  typedef std::map <std::string, unsigned> rmap_ty;
+
+  rmap_ty RegisterMap;
+
+  void InitRegisterMap(const TargetRegisterInfo *info) {
+    unsigned numRegs = info->getNumRegs();
+
+    for (unsigned i = 0; i < numRegs; ++i) {
+      const char *regName = info->getName(i);
+      if (regName)
+        RegisterMap[regName] = i;
     }
-  };
-  
-  class ARMAsmLexer : public ARMBaseAsmLexer {
-  public:
-    ARMAsmLexer(const Target &T, const MCAsmInfo &MAI)
-      : ARMBaseAsmLexer(T, MAI) {
-      std::string tripleString("arm-unknown-unknown");
-      std::string featureString;
-      OwningPtr<const TargetMachine> 
-        targetMachine(T.createTargetMachine(tripleString, featureString));
-      InitRegisterMap(targetMachine->getRegisterInfo());
+  }
+
+  unsigned MatchRegisterName(StringRef Name) {
+    rmap_ty::iterator iter = RegisterMap.find(Name.str());
+    if (iter != RegisterMap.end())
+      return iter->second;
+    else
+      return 0;
+  }
+
+  AsmToken LexToken() {
+    if (!Lexer) {
+      SetError(SMLoc(), "No MCAsmLexer installed");
+      return AsmToken(AsmToken::Error, "", 0);
     }
-  };
-  
-  class ThumbAsmLexer : public ARMBaseAsmLexer {
-  public:
-    ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI)
-      : ARMBaseAsmLexer(T, MAI) {
-      std::string tripleString("thumb-unknown-unknown");
-      std::string featureString;
-      OwningPtr<const TargetMachine> 
-        targetMachine(T.createTargetMachine(tripleString, featureString));
-      InitRegisterMap(targetMachine->getRegisterInfo());
+
+    switch (AsmInfo.getAssemblerDialect()) {
+    default:
+      SetError(SMLoc(), "Unhandled dialect");
+      return AsmToken(AsmToken::Error, "", 0);
+    case 0:
+      return LexTokenUAL();
     }
-  };
-}
+  }
+public:
+  ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
+    : TargetAsmLexer(T), AsmInfo(MAI) {
+  }
+};
+
+class ARMAsmLexer : public ARMBaseAsmLexer {
+public:
+  ARMAsmLexer(const Target &T, const MCAsmInfo &MAI)
+    : ARMBaseAsmLexer(T, MAI) {
+    std::string tripleString("arm-unknown-unknown");
+    std::string featureString;
+    OwningPtr<const TargetMachine>
+      targetMachine(T.createTargetMachine(tripleString, featureString));
+    InitRegisterMap(targetMachine->getRegisterInfo());
+  }
+};
+
+class ThumbAsmLexer : public ARMBaseAsmLexer {
+public:
+  ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI)
+    : ARMBaseAsmLexer(T, MAI) {
+    std::string tripleString("thumb-unknown-unknown");
+    std::string featureString;
+    OwningPtr<const TargetMachine>
+      targetMachine(T.createTargetMachine(tripleString, featureString));
+    InitRegisterMap(targetMachine->getRegisterInfo());
+  }
+};
+
+} // end anonymous namespace
 
 AsmToken ARMBaseAsmLexer::LexTokenUAL() {
   const AsmToken &lexedToken = lexDefinite();
-  
+
   switch (lexedToken.getKind()) {
-  default:
-    return AsmToken(lexedToken);
+  default: break;
   case AsmToken::Error:
     SetError(Lexer->getErrLoc(), Lexer->getErr());
-    return AsmToken(lexedToken);
-  case AsmToken::Identifier:
-  {
+    break;
+  case AsmToken::Identifier: {
     std::string upperCase = lexedToken.getString().str();
     std::string lowerCase = LowercaseString(upperCase);
     StringRef lowerRef(lowerCase);
-    
+
     unsigned regID = MatchRegisterName(lowerRef);
-    
-    if (regID) {
+    // Check for register aliases.
+    //   r13 -> sp
+    //   r14 -> lr
+    //   r15 -> pc
+    //   ip  -> r12
+    //   FIXME: Some assemblers support lots of others. Do we want them all?
+    if (!regID) {
+      regID = StringSwitch<unsigned>(lowerCase)
+        .Case("r13", ARM::SP)
+        .Case("r14", ARM::LR)
+        .Case("r15", ARM::PC)
+        .Case("ip", ARM::R12)
+        .Default(0);
+    }
+
+    if (regID)
       return AsmToken(AsmToken::Register,
                       lexedToken.getString(),
                       static_cast<int64_t>(regID));
-    } else {
-      return AsmToken(lexedToken);
-    }
   }
   }
+
+  return AsmToken(lexedToken);
 }
 
 extern "C" void LLVMInitializeARMAsmLexer() {
   RegisterAsmLexer<ARMAsmLexer> X(TheARMTarget);
   RegisterAsmLexer<ThumbAsmLexer> Y(TheThumbTarget);
 }
-
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 75e2a73..129af20 100644
--- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -8,28 +8,28 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMMCExpr.h"
+#include "ARMBaseRegisterInfo.h"
 #include "ARMSubtarget.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Target/TargetRegistry.h"
 #include "llvm/Target/TargetAsmParser.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 using namespace llvm;
 
-namespace {
-struct ARMOperand;
-
-// The shift types for register controlled shifts in arm memory addressing
+/// Shift types used for register controlled shifts in ARM memory addressing.
 enum ShiftType {
   Lsl,
   Lsr,
@@ -38,24 +38,30 @@ enum ShiftType {
   Rrx
 };
 
+namespace {
+
+class ARMOperand;
+
 class ARMAsmParser : public TargetAsmParser {
   MCAsmParser &Parser;
   TargetMachine &TM;
 
-private:
   MCAsmParser &getParser() const { return Parser; }
-
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
 
   void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
-
   bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
 
-  bool MaybeParseRegister(OwningPtr<ARMOperand> &Op, bool ParseWriteBack);
+  int TryParseRegister();
+  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
+  bool ParsePrefix(ARMMCExpr::VariantKind &RefKind);
+  const MCExpr *ApplyPrefixToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant);
 
-  bool ParseRegisterList(OwningPtr<ARMOperand> &Op);
-
-  bool ParseMemory(OwningPtr<ARMOperand> &Op);
 
   bool ParseMemoryOffsetReg(bool &Negative,
                             bool &OffsetRegShifted,
@@ -65,70 +71,76 @@ private:
                             bool &OffsetIsReg,
                             int &OffsetRegNum,
                             SMLoc &E);
-
   bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E);
-
-  bool ParseOperand(OwningPtr<ARMOperand> &Op);
-
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
-
   bool ParseDirectiveThumb(SMLoc L);
-
   bool ParseDirectiveThumbFunc(SMLoc L);
-
   bool ParseDirectiveCode(SMLoc L);
-
   bool ParseDirectiveSyntax(SMLoc L);
 
-  bool MatchInstruction(SMLoc IDLoc,
-                        const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        MCInst &Inst) {
-    if (!MatchInstructionImpl(Operands, Inst))
-      return false;
-
-    // FIXME: We should give nicer diagnostics about the exact failure.
-    Error(IDLoc, "unrecognized instruction");
-
-    return true;
-  }
+  bool MatchAndEmitInstruction(SMLoc IDLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out);
+  void GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+                             bool &CanAcceptPredicationCode);
 
   /// @name Auto-generated Match Functions
   /// {
 
-  unsigned ComputeAvailableFeatures(const ARMSubtarget *Subtarget) const;
-
-  bool MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*>
-                              &Operands,
-                            MCInst &Inst);
+#define GET_ASSEMBLER_HEADER
+#include "ARMGenAsmMatcher.inc"
 
   /// }
 
+  OperandMatchResultTy tryParseCoprocNumOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy tryParseCoprocRegOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy tryParseMemBarrierOptOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy tryParseProcIFlagsOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy tryParseMSRMaskOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
 
 public:
   ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM)
-    : TargetAsmParser(T), Parser(_Parser), TM(_TM) {}
+    : TargetAsmParser(T), Parser(_Parser), TM(_TM) {
+      // Initialize the set of available features.
+      setAvailableFeatures(ComputeAvailableFeatures(
+          &TM.getSubtarget<ARMSubtarget>()));
+    }
 
   virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
   virtual bool ParseDirective(AsmToken DirectiveID);
 };
-  
+} // end anonymous namespace
+
+namespace {
+
 /// ARMOperand - Instances of this class represent a parsed ARM machine
 /// instruction.
-struct ARMOperand : public MCParsedAsmOperand {
-private:
-  ARMOperand() {}
-public:
+class ARMOperand : public MCParsedAsmOperand {
   enum KindTy {
     CondCode,
+    CCOut,
+    CoprocNum,
+    CoprocReg,
     Immediate,
+    MemBarrierOpt,
     Memory,
+    MSRMask,
+    ProcIFlags,
     Register,
+    RegisterList,
+    DPRRegisterList,
+    SPRRegisterList,
     Token
   } Kind;
 
   SMLoc StartLoc, EndLoc;
+  SmallVector<unsigned, 8> Registers;
 
   union {
     struct {
@@ -136,40 +148,54 @@ public:
     } CC;
 
     struct {
+      ARM_MB::MemBOpt Val;
+    } MBOpt;
+
+    struct {
+      unsigned Val;
+    } Cop;
+
+    struct {
+      ARM_PROC::IFlags Val;
+    } IFlags;
+
+    struct {
+      unsigned Val;
+    } MMask;
+
+    struct {
       const char *Data;
       unsigned Length;
     } Tok;
 
     struct {
       unsigned RegNum;
-      bool Writeback;
     } Reg;
 
     struct {
       const MCExpr *Val;
     } Imm;
-    
-    // This is for all forms of ARM address expressions
+
+    /// Combined record for all forms of ARM address expressions.
     struct {
       unsigned BaseRegNum;
-      unsigned OffsetRegNum; // used when OffsetIsReg is true
-      const MCExpr *Offset; // used when OffsetIsReg is false
-      const MCExpr *ShiftAmount; // used when OffsetRegShifted is true
-      enum ShiftType ShiftType;  // used when OffsetRegShifted is true
-      unsigned
-        OffsetRegShifted : 1, // only used when OffsetIsReg is true
-        Preindexed : 1,
-        Postindexed : 1,
-        OffsetIsReg : 1,
-        Negative : 1, // only used when OffsetIsReg is true
-        Writeback : 1;
+      union {
+        unsigned RegNum;     ///< Offset register num, when OffsetIsReg.
+        const MCExpr *Value; ///< Offset value, when !OffsetIsReg.
+      } Offset;
+      const MCExpr *ShiftAmount;     // used when OffsetRegShifted is true
+      enum ShiftType ShiftType;      // used when OffsetRegShifted is true
+      unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true
+      unsigned Preindexed       : 1;
+      unsigned Postindexed      : 1;
+      unsigned OffsetIsReg      : 1;
+      unsigned Negative         : 1; // only used when OffsetIsReg is true
+      unsigned Writeback        : 1;
     } Mem;
-
   };
-  
-  //ARMOperand(KindTy K, SMLoc S, SMLoc E)
-  //  : Kind(K), StartLoc(S), EndLoc(E) {}
-  
+
+  ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
   ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() {
     Kind = o.Kind;
     StartLoc = o.StartLoc;
@@ -181,18 +207,36 @@ public:
     case Token:
       Tok = o.Tok;
       break;
+    case CCOut:
     case Register:
       Reg = o.Reg;
       break;
+    case RegisterList:
+    case DPRRegisterList:
+    case SPRRegisterList:
+      Registers = o.Registers;
+      break;
+    case CoprocNum:
+    case CoprocReg:
+      Cop = o.Cop;
+      break;
     case Immediate:
       Imm = o.Imm;
       break;
+    case MemBarrierOpt:
+      MBOpt = o.MBOpt;
+      break;
     case Memory:
       Mem = o.Mem;
       break;
+    case MSRMask:
+      MMask = o.MMask;
+      break;
+    case ProcIFlags:
+      IFlags = o.IFlags;
     }
   }
-  
+
   /// getStartLoc - Get the location of the first token of this operand.
   SMLoc getStartLoc() const { return StartLoc; }
   /// getEndLoc - Get the location of the last token of this operand.
@@ -203,32 +247,129 @@ public:
     return CC.Val;
   }
 
+  unsigned getCoproc() const {
+    assert((Kind == CoprocNum || Kind == CoprocReg) && "Invalid access!");
+    return Cop.Val;
+  }
+
   StringRef getToken() const {
     assert(Kind == Token && "Invalid access!");
     return StringRef(Tok.Data, Tok.Length);
   }
 
   unsigned getReg() const {
-    assert(Kind == Register && "Invalid access!");
+    assert((Kind == Register || Kind == CCOut) && "Invalid access!");
     return Reg.RegNum;
   }
 
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == RegisterList || Kind == DPRRegisterList ||
+            Kind == SPRRegisterList) && "Invalid access!");
+    return Registers;
+  }
+
   const MCExpr *getImm() const {
     assert(Kind == Immediate && "Invalid access!");
     return Imm.Val;
   }
 
-  bool isCondCode() const { return Kind == CondCode; }
+  ARM_MB::MemBOpt getMemBarrierOpt() const {
+    assert(Kind == MemBarrierOpt && "Invalid access!");
+    return MBOpt.Val;
+  }
 
-  bool isImm() const { return Kind == Immediate; }
+  ARM_PROC::IFlags getProcIFlags() const {
+    assert(Kind == ProcIFlags && "Invalid access!");
+    return IFlags.Val;
+  }
+
+  unsigned getMSRMask() const {
+    assert(Kind == MSRMask && "Invalid access!");
+    return MMask.Val;
+  }
+
+  /// @name Memory Operand Accessors
+  /// @{
+
+  unsigned getMemBaseRegNum() const {
+    return Mem.BaseRegNum;
+  }
+  unsigned getMemOffsetRegNum() const {
+    assert(Mem.OffsetIsReg && "Invalid access!");
+    return Mem.Offset.RegNum;
+  }
+  const MCExpr *getMemOffset() const {
+    assert(!Mem.OffsetIsReg && "Invalid access!");
+    return Mem.Offset.Value;
+  }
+  unsigned getMemOffsetRegShifted() const {
+    assert(Mem.OffsetIsReg && "Invalid access!");
+    return Mem.OffsetRegShifted;
+  }
+  const MCExpr *getMemShiftAmount() const {
+    assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
+    return Mem.ShiftAmount;
+  }
+  enum ShiftType getMemShiftType() const {
+    assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
+    return Mem.ShiftType;
+  }
+  bool getMemPreindexed() const { return Mem.Preindexed; }
+  bool getMemPostindexed() const { return Mem.Postindexed; }
+  bool getMemOffsetIsReg() const { return Mem.OffsetIsReg; }
+  bool getMemNegative() const { return Mem.Negative; }
+  bool getMemWriteback() const { return Mem.Writeback; }
+
+  /// @}
 
+  bool isCoprocNum() const { return Kind == CoprocNum; }
+  bool isCoprocReg() const { return Kind == CoprocReg; }
+  bool isCondCode() const { return Kind == CondCode; }
+  bool isCCOut() const { return Kind == CCOut; }
+  bool isImm() const { return Kind == Immediate; }
   bool isReg() const { return Kind == Register; }
+  bool isRegList() const { return Kind == RegisterList; }
+  bool isDPRRegList() const { return Kind == DPRRegisterList; }
+  bool isSPRRegList() const { return Kind == SPRRegisterList; }
+  bool isToken() const { return Kind == Token; }
+  bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; }
+  bool isMemory() const { return Kind == Memory; }
+  bool isMemMode5() const {
+    if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() ||
+        getMemNegative())
+      return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    if (!CE) return false;
+
+    // The offset must be a multiple of 4 in the range 0-1020.
+    int64_t Value = CE->getValue();
+    return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020);
+  }
+  bool isMemModeRegThumb() const {
+    if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback())
+      return false;
+    return true;
+  }
+  bool isMemModeImmThumb() const {
+    if (!isMemory() || getMemOffsetIsReg() || getMemWriteback())
+      return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    if (!CE) return false;
 
-  bool isToken() const {return Kind == Token; }
+    // The offset must be a multiple of 4 in the range 0-124.
+    uint64_t Value = CE->getValue();
+    return ((Value & 0x3) == 0 && Value <= 124);
+  }
+  bool isMSRMask() const { return Kind == MSRMask; }
+  bool isProcIFlags() const { return Kind == ProcIFlags; }
 
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
-    // Add as immediates when possible.
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (Expr == 0)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
       Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
     else
       Inst.addOperand(MCOperand::CreateExpr(Expr));
@@ -237,8 +378,23 @@ public:
   void addCondCodeOperands(MCInst &Inst, unsigned N) const {
     assert(N == 2 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
-    // FIXME: What belongs here?
-    Inst.addOperand(MCOperand::CreateReg(0));
+    unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+    Inst.addOperand(MCOperand::CreateReg(RegNum));
+  }
+
+  void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCoprocRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCCOutOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
   void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -246,66 +402,181 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ++I)
+      Inst.addOperand(MCOperand::CreateReg(*I));
+  }
+
+  void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
   void addImmOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
   }
 
+  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
+  }
+
+  void addMemMode5Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && isMemMode5() && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
+    assert(!getMemOffsetIsReg() && "Invalid mode 5 operand");
+
+    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
+    // the difference?
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    assert(CE && "Non-constant mode 5 offset operand!");
+
+    // The MCInst offset operand doesn't include the low two bits (like
+    // the instruction encoding).
+    int64_t Offset = CE->getValue() / 4;
+    if (Offset >= 0)
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add,
+                                                             Offset)));
+    else
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub,
+                                                             -Offset)));
+  }
+
+  void addMemModeRegThumbOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && isMemModeRegThumb() && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
+    Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+  }
+
+  void addMemModeImmThumbOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && isMemModeImmThumb() && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    assert(CE && "Non-constant mode offset operand!");
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  }
+
+  void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask())));
+  }
+
+  void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
+  }
+
   virtual void dump(raw_ostream &OS) const;
 
-  static void CreateCondCode(OwningPtr<ARMOperand> &Op, ARMCC::CondCodes CC,
-                             SMLoc S) {
-    Op.reset(new ARMOperand);
-    Op->Kind = CondCode;
+  static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(CondCode);
     Op->CC.Val = CC;
     Op->StartLoc = S;
     Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(CoprocNum);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
 
-  static void CreateToken(OwningPtr<ARMOperand> &Op, StringRef Str,
-                          SMLoc S) {
-    Op.reset(new ARMOperand);
-    Op->Kind = Token;
+  static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(CoprocReg);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(CCOut);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateToken(StringRef Str, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(Token);
     Op->Tok.Data = Str.data();
     Op->Tok.Length = Str.size();
     Op->StartLoc = S;
     Op->EndLoc = S;
+    return Op;
   }
 
-  static void CreateReg(OwningPtr<ARMOperand> &Op, unsigned RegNum, 
-                        bool Writeback, SMLoc S, SMLoc E) {
-    Op.reset(new ARMOperand);
-    Op->Kind = Register;
+  static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(Register);
     Op->Reg.RegNum = RegNum;
-    Op->Reg.Writeback = Writeback;
-    
     Op->StartLoc = S;
     Op->EndLoc = E;
+    return Op;
   }
 
-  static void CreateImm(OwningPtr<ARMOperand> &Op, const MCExpr *Val,
-                        SMLoc S, SMLoc E) {
-    Op.reset(new ARMOperand);
-    Op->Kind = Immediate;
+  static ARMOperand *
+  CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
+                SMLoc StartLoc, SMLoc EndLoc) {
+    KindTy Kind = RegisterList;
+
+    if (ARM::DPRRegClass.contains(Regs.front().first))
+      Kind = DPRRegisterList;
+    else if (ARM::SPRRegClass.contains(Regs.front().first))
+      Kind = SPRRegisterList;
+
+    ARMOperand *Op = new ARMOperand(Kind);
+    for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
+           I = Regs.begin(), E = Regs.end(); I != E; ++I)
+      Op->Registers.push_back(I->first);
+    array_pod_sort(Op->Registers.begin(), Op->Registers.end());
+    Op->StartLoc = StartLoc;
+    Op->EndLoc = EndLoc;
+    return Op;
+  }
+
+  static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(Immediate);
     Op->Imm.Val = Val;
-    
     Op->StartLoc = S;
     Op->EndLoc = E;
+    return Op;
   }
 
-  static void CreateMem(OwningPtr<ARMOperand> &Op,
-                        unsigned BaseRegNum, bool OffsetIsReg,
-                        const MCExpr *Offset, unsigned OffsetRegNum,
-                        bool OffsetRegShifted, enum ShiftType ShiftType,
-                        const MCExpr *ShiftAmount, bool Preindexed,
-                        bool Postindexed, bool Negative, bool Writeback,
-                        SMLoc S, SMLoc E) {
-    Op.reset(new ARMOperand);
-    Op->Kind = Memory;
+  static ARMOperand *CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
+                               const MCExpr *Offset, int OffsetRegNum,
+                               bool OffsetRegShifted, enum ShiftType ShiftType,
+                               const MCExpr *ShiftAmount, bool Preindexed,
+                               bool Postindexed, bool Negative, bool Writeback,
+                               SMLoc S, SMLoc E) {
+    assert((OffsetRegNum == -1 || OffsetIsReg) &&
+           "OffsetRegNum must imply OffsetIsReg!");
+    assert((!OffsetRegShifted || OffsetIsReg) &&
+           "OffsetRegShifted must imply OffsetIsReg!");
+    assert((Offset || OffsetIsReg) &&
+           "Offset must exists unless register offset is used!");
+    assert((!ShiftAmount || (OffsetIsReg && OffsetRegShifted)) &&
+           "Cannot have shift amount without shifted register offset!");
+    assert((!Offset || !OffsetIsReg) &&
+           "Cannot have expression offset and register offset!");
+
+    ARMOperand *Op = new ARMOperand(Memory);
     Op->Mem.BaseRegNum = BaseRegNum;
     Op->Mem.OffsetIsReg = OffsetIsReg;
-    Op->Mem.Offset = Offset;
-    Op->Mem.OffsetRegNum = OffsetRegNum;
+    if (OffsetIsReg)
+      Op->Mem.Offset.RegNum = OffsetRegNum;
+    else
+      Op->Mem.Offset.Value = Offset;
     Op->Mem.OffsetRegShifted = OffsetRegShifted;
     Op->Mem.ShiftType = ShiftType;
     Op->Mem.ShiftAmount = ShiftAmount;
@@ -313,9 +584,34 @@ public:
     Op->Mem.Postindexed = Postindexed;
     Op->Mem.Negative = Negative;
     Op->Mem.Writeback = Writeback;
-    
+
     Op->StartLoc = S;
     Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(MemBarrierOpt);
+    Op->MBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(ProcIFlags);
+    Op->IFlags.Val = IFlags;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(MSRMask);
+    Op->MMask.Val = MMask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
 };
 
@@ -324,17 +620,77 @@ public:
 void ARMOperand::dump(raw_ostream &OS) const {
   switch (Kind) {
   case CondCode:
-    OS << ARMCondCodeToString(getCondCode());
+    OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
+    break;
+  case CCOut:
+    OS << "<ccout " << getReg() << ">";
+    break;
+  case CoprocNum:
+    OS << "<coprocessor number: " << getCoproc() << ">";
+    break;
+  case CoprocReg:
+    OS << "<coprocessor register: " << getCoproc() << ">";
+    break;
+  case MSRMask:
+    OS << "<mask: " << getMSRMask() << ">";
     break;
   case Immediate:
     getImm()->print(OS);
     break;
+  case MemBarrierOpt:
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
+    break;
   case Memory:
-    OS << "<memory>";
+    OS << "<memory "
+       << "base:" << getMemBaseRegNum();
+    if (getMemOffsetIsReg()) {
+      OS << " offset:<register " << getMemOffsetRegNum();
+      if (getMemOffsetRegShifted()) {
+        OS << " offset-shift-type:" << getMemShiftType();
+        OS << " offset-shift-amount:" << *getMemShiftAmount();
+      }
+    } else {
+      OS << " offset:" << *getMemOffset();
+    }
+    if (getMemOffsetIsReg())
+      OS << " (offset-is-reg)";
+    if (getMemPreindexed())
+      OS << " (pre-indexed)";
+    if (getMemPostindexed())
+      OS << " (post-indexed)";
+    if (getMemNegative())
+      OS << " (negative)";
+    if (getMemWriteback())
+      OS << " (writeback)";
+    OS << ">";
+    break;
+  case ProcIFlags: {
+    OS << "<ARM_PROC::";
+    unsigned IFlags = getProcIFlags();
+    for (int i=2; i >= 0; --i)
+      if (IFlags & (1 << i))
+        OS << ARM_PROC::IFlagsToString(1 << i);
+    OS << ">";
     break;
+  }
   case Register:
     OS << "<register " << getReg() << ">";
     break;
+  case RegisterList:
+  case DPRRegisterList:
+  case SPRRegisterList: {
+    OS << "<register_list ";
+
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ) {
+      OS << *I;
+      if (++I < E) OS << ", ";
+    }
+
+    OS << ">";
+    break;
+  }
   case Token:
     OS << "'" << getToken() << "'";
     break;
@@ -348,184 +704,456 @@ static unsigned MatchRegisterName(StringRef Name);
 
 /// }
 
+bool ARMAsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  RegNo = TryParseRegister();
+
+  return (RegNo == (unsigned)-1);
+}
+
 /// Try to parse a register name.  The token must be an Identifier when called,
-/// and if it is a register name a Reg operand is created, the token is eaten
-/// and false is returned.  Else true is returned and no token is eaten.
-/// TODO this is likely to change to allow different register types and or to
-/// parse for a specific register type.
-bool ARMAsmParser::MaybeParseRegister
-  (OwningPtr<ARMOperand> &Op, bool ParseWriteBack) {
-  SMLoc S, E;
+/// and if it is a register name the token is eaten and the register number is
+/// returned.  Otherwise return -1.
+///
+int ARMAsmParser::TryParseRegister() {
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
 
   // FIXME: Validate register for the current architecture; we have to do
   // validation later, so maybe there is no need for this here.
-  int RegNum;
+  std::string upperCase = Tok.getString().str();
+  std::string lowerCase = LowercaseString(upperCase);
+  unsigned RegNum = MatchRegisterName(lowerCase);
+  if (!RegNum) {
+    RegNum = StringSwitch<unsigned>(lowerCase)
+      .Case("r13", ARM::SP)
+      .Case("r14", ARM::LR)
+      .Case("r15", ARM::PC)
+      .Case("ip", ARM::R12)
+      .Default(0);
+  }
+  if (!RegNum) return -1;
 
-  RegNum = MatchRegisterName(Tok.getString());
-  if (RegNum == -1)
-    return true;
-  
-  S = Tok.getLoc();
-  
   Parser.Lex(); // Eat identifier token.
-    
-  E = Parser.getTok().getLoc();
+  return RegNum;
+}
 
-  bool Writeback = false;
-  if (ParseWriteBack) {
-    const AsmToken &ExclaimTok = Parser.getTok();
-    if (ExclaimTok.is(AsmToken::Exclaim)) {
-      E = ExclaimTok.getLoc();
-      Writeback = true;
-      Parser.Lex(); // Eat exclaim token
+/// Try to parse a register name.  The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback". 'true' if it's not a register.
+///
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::
+TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  int RegNo = TryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, S, Parser.getTok().getLoc()));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse an coprocessor related
+/// instruction with a symbolic operand name. Example: "p1", "p7", "c3",
+/// "c5", ...
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  // Use the same layout as the tablegen'erated register name matcher. Ugly,
+  // but efficient.
+  switch (Name.size()) {
+  default: break;
+  case 2:
+    if (Name[0] != CoprocOp)
+      return -1;
+    switch (Name[1]) {
+    default:  return -1;
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    }
+    break;
+  case 3:
+    if (Name[0] != CoprocOp || Name[1] != '1')
+      return -1;
+    switch (Name[2]) {
+    default:  return -1;
+    case '0': return 10;
+    case '1': return 11;
+    case '2': return 12;
+    case '3': return 13;
+    case '4': return 14;
+    case '5': return 15;
     }
+    break;
   }
 
-  ARMOperand::CreateReg(Op, RegNum, Writeback, S, E);
+  return -1;
+}
 
-  return false;
+/// tryParseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (Num == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
+  return MatchOperand_Success;
 }
 
-/// Parse a register list, return false if successful else return true or an 
-/// error.  The first token must be a '{' when called.
-bool ARMAsmParser::ParseRegisterList(OwningPtr<ARMOperand> &Op) {
-  SMLoc S, E;
-  assert(Parser.getTok().is(AsmToken::LCurly) &&
-         "Token is not an Left Curly Brace");
-  S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat left curly brace token.
-
-  const AsmToken &RegTok = Parser.getTok();
-  SMLoc RegLoc = RegTok.getLoc();
-  if (RegTok.isNot(AsmToken::Identifier))
-    return Error(RegLoc, "register expected");
-  int RegNum = MatchRegisterName(RegTok.getString());
-  if (RegNum == -1)
-    return Error(RegLoc, "register expected");
+/// tryParseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (Reg == -1)
+    return MatchOperand_NoMatch;
+
   Parser.Lex(); // Eat identifier token.
-  unsigned RegList = 1 << RegNum;
+  Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S));
+  return MatchOperand_Success;
+}
 
-  int HighRegNum = RegNum;
-  // TODO ranges like "{Rn-Rm}"
-  while (Parser.getTok().is(AsmToken::Comma)) {
-    Parser.Lex(); // Eat comma token.
+/// Parse a register list, return it if successful else return null.  The first
+/// token must be a '{' when called.
+bool ARMAsmParser::
+ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+
+  // Read the rest of the registers in the list.
+  unsigned PrevRegNum = 0;
+  SmallVector<std::pair<unsigned, SMLoc>, 32> Registers;
+
+  do {
+    bool IsRange = Parser.getTok().is(AsmToken::Minus);
+    Parser.Lex(); // Eat non-identifier token.
 
     const AsmToken &RegTok = Parser.getTok();
     SMLoc RegLoc = RegTok.getLoc();
-    if (RegTok.isNot(AsmToken::Identifier))
-      return Error(RegLoc, "register expected");
-    int RegNum = MatchRegisterName(RegTok.getString());
-    if (RegNum == -1)
-      return Error(RegLoc, "register expected");
+    if (RegTok.isNot(AsmToken::Identifier)) {
+      Error(RegLoc, "register expected");
+      return true;
+    }
 
-    if (RegList & (1 << RegNum))
-      Warning(RegLoc, "register duplicated in register list");
-    else if (RegNum <= HighRegNum)
-      Warning(RegLoc, "register not in ascending order in register list");
-    RegList |= 1 << RegNum;
-    HighRegNum = RegNum;
+    int RegNum = TryParseRegister();
+    if (RegNum == -1) {
+      Error(RegLoc, "register expected");
+      return true;
+    }
 
-    Parser.Lex(); // Eat identifier token.
-  }
+    if (IsRange) {
+      int Reg = PrevRegNum;
+      do {
+        ++Reg;
+        Registers.push_back(std::make_pair(Reg, RegLoc));
+      } while (Reg != RegNum);
+    } else {
+      Registers.push_back(std::make_pair(RegNum, RegLoc));
+    }
+
+    PrevRegNum = RegNum;
+  } while (Parser.getTok().is(AsmToken::Comma) ||
+           Parser.getTok().is(AsmToken::Minus));
+
+  // Process the right curly brace of the list.
   const AsmToken &RCurlyTok = Parser.getTok();
-  if (RCurlyTok.isNot(AsmToken::RCurly))
-    return Error(RCurlyTok.getLoc(), "'}' expected");
-  E = RCurlyTok.getLoc();
-  Parser.Lex(); // Eat left curly brace token.
+  if (RCurlyTok.isNot(AsmToken::RCurly)) {
+    Error(RCurlyTok.getLoc(), "'}' expected");
+    return true;
+  }
+
+  SMLoc E = RCurlyTok.getLoc();
+  Parser.Lex(); // Eat right curly brace token.
+
+  // Verify the register list.
+  SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
+    RI = Registers.begin(), RE = Registers.end();
+
+  unsigned HighRegNum = getARMRegisterNumbering(RI->first);
+  bool EmittedWarning = false;
+
+  DenseMap<unsigned, bool> RegMap;
+  RegMap[HighRegNum] = true;
 
+  for (++RI; RI != RE; ++RI) {
+    const std::pair<unsigned, SMLoc> &RegInfo = *RI;
+    unsigned Reg = getARMRegisterNumbering(RegInfo.first);
+
+    if (RegMap[Reg]) {
+      Error(RegInfo.second, "register duplicated in register list");
+      return true;
+    }
+
+    if (!EmittedWarning && Reg < HighRegNum)
+      Warning(RegInfo.second,
+              "register not in ascending order in register list");
+
+    RegMap[Reg] = true;
+    HighRegNum = std::max(Reg, HighRegNum);
+  }
+
+  Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
   return false;
 }
 
-/// Parse an arm memory expression, return false if successful else return true
+/// tryParseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  StringRef OptStr = Tok.getString();
+
+  unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()))
+    .Case("sy",    ARM_MB::SY)
+    .Case("st",    ARM_MB::ST)
+    .Case("ish",   ARM_MB::ISH)
+    .Case("ishst", ARM_MB::ISHST)
+    .Case("nsh",   ARM_MB::NSH)
+    .Case("nshst", ARM_MB::NSHST)
+    .Case("osh",   ARM_MB::OSH)
+    .Case("oshst", ARM_MB::OSHST)
+    .Default(~0U);
+
+  if (Opt == ~0U)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+/// tryParseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  StringRef IFlagsStr = Tok.getString();
+
+  unsigned IFlags = 0;
+  for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+    unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+    .Case("a", ARM_PROC::A)
+    .Case("i", ARM_PROC::I)
+    .Case("f", ARM_PROC::F)
+    .Default(~0U);
+
+    // If some specific iflag is already set, it means that some letter is
+    // present more than once, this is not acceptable.
+    if (Flag == ~0U || (IFlags & Flag))
+      return MatchOperand_NoMatch;
+
+    IFlags |= Flag;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S));
+  return MatchOperand_Success;
+}
+
+/// tryParseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+  StringRef Mask = Tok.getString();
+
+  // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
+  size_t Start = 0, Next = Mask.find('_');
+  StringRef Flags = "";
+  StringRef SpecReg = Mask.slice(Start, Next);
+  if (Next != StringRef::npos)
+    Flags = Mask.slice(Next+1, Mask.size());
+
+  // FlagsVal contains the complete mask:
+  // 3-0: Mask
+  // 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  unsigned FlagsVal = 0;
+
+  if (SpecReg == "apsr") {
+    FlagsVal = StringSwitch<unsigned>(Flags)
+    .Case("nzcvq",  0x8) // same as CPSR_c
+    .Case("g",      0x4) // same as CPSR_s
+    .Case("nzcvqg", 0xc) // same as CPSR_fs
+    .Default(~0U);
+
+    if (FlagsVal == ~0U) {
+      if (!Flags.empty())
+        return MatchOperand_NoMatch;
+      else
+        FlagsVal = 0; // No flag
+    }
+  } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    for (int i = 0, e = Flags.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
+      .Case("c", 1)
+      .Case("x", 2)
+      .Case("s", 4)
+      .Case("f", 8)
+      .Default(~0U);
+
+      // If some specific flag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (FlagsVal == ~0U || (FlagsVal & Flag))
+        return MatchOperand_NoMatch;
+      FlagsVal |= Flag;
+    }
+  } else // No match for special register.
+    return MatchOperand_NoMatch;
+
+  // Special register without flags are equivalent to "fc" flags.
+  if (!FlagsVal)
+    FlagsVal = 0x9;
+
+  // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  if (SpecReg == "spsr")
+    FlagsVal |= 16;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+  return MatchOperand_Success;
+}
+
+/// Parse an ARM memory expression, return false if successful else return true
 /// or an error.  The first token must be a '[' when called.
+///
 /// TODO Only preindexing and postindexing addressing are started, unindexed
 /// with option, etc are still to do.
-bool ARMAsmParser::ParseMemory(OwningPtr<ARMOperand> &Op) {
+bool ARMAsmParser::
+ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S, E;
   assert(Parser.getTok().is(AsmToken::LBrac) &&
-         "Token is not an Left Bracket");
+         "Token is not a Left Bracket");
   S = Parser.getTok().getLoc();
   Parser.Lex(); // Eat left bracket token.
 
   const AsmToken &BaseRegTok = Parser.getTok();
-  if (BaseRegTok.isNot(AsmToken::Identifier))
-    return Error(BaseRegTok.getLoc(), "register expected");
-  if (MaybeParseRegister(Op, false))
-    return Error(BaseRegTok.getLoc(), "register expected");
-  int BaseRegNum = Op->getReg();
+  if (BaseRegTok.isNot(AsmToken::Identifier)) {
+    Error(BaseRegTok.getLoc(), "register expected");
+    return true;
+  }
+  int BaseRegNum = TryParseRegister();
+  if (BaseRegNum == -1) {
+    Error(BaseRegTok.getLoc(), "register expected");
+    return true;
+  }
+
+  // The next token must either be a comma or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac))
+    return true;
 
   bool Preindexed = false;
   bool Postindexed = false;
   bool OffsetIsReg = false;
   bool Negative = false;
   bool Writeback = false;
+  ARMOperand *WBOp = 0;
+  int OffsetRegNum = -1;
+  bool OffsetRegShifted = false;
+  enum ShiftType ShiftType = Lsl;
+  const MCExpr *ShiftAmount = 0;
+  const MCExpr *Offset = 0;
 
   // First look for preindexed address forms, that is after the "[Rn" we now
   // have to see if the next token is a comma.
-  const AsmToken &Tok = Parser.getTok();
   if (Tok.is(AsmToken::Comma)) {
     Preindexed = true;
     Parser.Lex(); // Eat comma token.
-    int OffsetRegNum;
-    bool OffsetRegShifted;
-    enum ShiftType ShiftType;
-    const MCExpr *ShiftAmount;
-    const MCExpr *Offset;
-    if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount,
-                            Offset, OffsetIsReg, OffsetRegNum, E))
+
+    if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount,
+                             Offset, OffsetIsReg, OffsetRegNum, E))
       return true;
     const AsmToken &RBracTok = Parser.getTok();
-    if (RBracTok.isNot(AsmToken::RBrac))
-      return Error(RBracTok.getLoc(), "']' expected");
+    if (RBracTok.isNot(AsmToken::RBrac)) {
+      Error(RBracTok.getLoc(), "']' expected");
+      return true;
+    }
     E = RBracTok.getLoc();
     Parser.Lex(); // Eat right bracket token.
 
     const AsmToken &ExclaimTok = Parser.getTok();
     if (ExclaimTok.is(AsmToken::Exclaim)) {
-      E = ExclaimTok.getLoc();
+      WBOp = ARMOperand::CreateToken(ExclaimTok.getString(),
+                                     ExclaimTok.getLoc());
       Writeback = true;
       Parser.Lex(); // Eat exclaim token
     }
-    ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
-                          OffsetRegShifted, ShiftType, ShiftAmount,
-                          Preindexed, Postindexed, Negative, Writeback, S, E);
-    return false;
-  }
-  // The "[Rn" we have so far was not followed by a comma.
-  else if (Tok.is(AsmToken::RBrac)) {
-    // This is a post indexing addressing forms, that is a ']' follows after
-    // the "[Rn".
-    Postindexed = true;
-    Writeback = true;
+  } else {
+    // The "[Rn" we have so far was not followed by a comma.
+
+    // If there's anything other than the right brace, this is a post indexing
+    // addressing form.
     E = Tok.getLoc();
     Parser.Lex(); // Eat right bracket token.
 
-    int OffsetRegNum = 0;
-    bool OffsetRegShifted = false;
-    enum ShiftType ShiftType;
-    const MCExpr *ShiftAmount;
-    const MCExpr *Offset;
-
     const AsmToken &NextTok = Parser.getTok();
+
     if (NextTok.isNot(AsmToken::EndOfStatement)) {
-      if (NextTok.isNot(AsmToken::Comma))
-        return Error(NextTok.getLoc(), "',' expected");
+      Postindexed = true;
+      Writeback = true;
+
+      if (NextTok.isNot(AsmToken::Comma)) {
+        Error(NextTok.getLoc(), "',' expected");
+        return true;
+      }
+
       Parser.Lex(); // Eat comma token.
-      if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
-                              ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, 
-                              E))
+
+      if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
+                               ShiftAmount, Offset, OffsetIsReg, OffsetRegNum,
+                               E))
         return true;
     }
+  }
 
-    ARMOperand::CreateMem(Op, BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
-                          OffsetRegShifted, ShiftType, ShiftAmount,
-                          Preindexed, Postindexed, Negative, Writeback, S, E);
-    return false;
+  // Force Offset to exist if used.
+  if (!OffsetIsReg) {
+    if (!Offset)
+      Offset = MCConstantExpr::Create(0, getContext());
   }
 
-  return true;
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset,
+                                           OffsetRegNum, OffsetRegShifted,
+                                           ShiftType, ShiftAmount, Preindexed,
+                                           Postindexed, Negative, Writeback,
+                                           S, E));
+  if (WBOp)
+    Operands.push_back(WBOp);
+
+  return false;
 }
 
 /// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn],"
@@ -543,7 +1171,6 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
                                         bool &OffsetIsReg,
                                         int &OffsetRegNum,
                                         SMLoc &E) {
-  OwningPtr<ARMOperand> Op;
   Negative = false;
   OffsetRegShifted = false;
   OffsetIsReg = false;
@@ -559,13 +1186,15 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
   // See if there is a register following the "[Rn," or "[Rn]," we have so far.
   const AsmToken &OffsetRegTok = Parser.getTok();
   if (OffsetRegTok.is(AsmToken::Identifier)) {
-    OffsetIsReg = !MaybeParseRegister(Op, false);
-    if (OffsetIsReg) {
-      E = Op->getEndLoc();
-      OffsetRegNum = Op->getReg();
+    SMLoc CurLoc = OffsetRegTok.getLoc();
+    OffsetRegNum = TryParseRegister();
+    if (OffsetRegNum != -1) {
+      OffsetIsReg = true;
+      E = CurLoc;
     }
   }
-  // If we parsed a register as the offset then their can be a shift after that
+
+  // If we parsed a register as the offset then there can be a shift after that.
   if (OffsetRegNum != -1) {
     // Look for a comma then a shift
     const AsmToken &Tok = Parser.getTok();
@@ -583,7 +1212,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
     const AsmToken &HashTok = Parser.getTok();
     if (HashTok.isNot(AsmToken::Hash))
       return Error(HashTok.getLoc(), "'#' expected");
-    
+
     Parser.Lex(); // Eat hash token.
 
     if (getParser().ParseExpression(Offset))
@@ -597,8 +1226,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
 ///   ( lsl | lsr | asr | ror ) , # shift_amount
 ///   rrx
 /// and returns true if it parses a shift otherwise it returns false.
-bool ARMAsmParser::ParseShift(ShiftType &St, 
-                              const MCExpr *&ShiftAmount, 
+bool ARMAsmParser::ParseShift(ShiftType &St, const MCExpr *&ShiftAmount,
                               SMLoc &E) {
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
@@ -636,13 +1264,33 @@ bool ARMAsmParser::ParseShift(ShiftType &St,
 
 /// Parse a arm instruction operand.  For now this parses the operand regardless
 /// of the mnemonic.
-bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) {
+bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                                StringRef Mnemonic) {
   SMLoc S, E;
-  
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
   switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
   case AsmToken::Identifier:
-    if (!MaybeParseRegister(Op, true))
+    if (!TryParseRegisterWithWriteBack(Operands))
       return false;
+
+    // Fall though for the Identifier case that is not a register or a
+    // special name.
+  case AsmToken::Integer: // things like 1f and 2b as a branch targets
+  case AsmToken::Dot: {   // . as a branch target
     // This was not a register so parse other operands that start with an
     // identifier (like labels) as expressions and create them as immediates.
     const MCExpr *IdVal;
@@ -650,12 +1298,13 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) {
     if (getParser().ParseExpression(IdVal))
       return true;
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-    ARMOperand::CreateImm(Op, IdVal, S, E);
+    Operands.push_back(ARMOperand::CreateImm(IdVal, S, E));
     return false;
+  }
   case AsmToken::LBrac:
-    return ParseMemory(Op);
+    return ParseMemory(Operands);
   case AsmToken::LCurly:
-    return ParseRegisterList(Op);
+    return ParseRegisterList(Operands);
   case AsmToken::Hash:
     // #42 -> immediate.
     // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
@@ -665,28 +1314,134 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) {
     if (getParser().ParseExpression(ImmVal))
       return true;
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-    ARMOperand::CreateImm(Op, ImmVal, S, E);
+    Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
     return false;
-  default:
-    return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+  case AsmToken::Colon: {
+    // ":lower16:" and ":upper16:" expression prefixes
+    // FIXME: Check it's an expression prefix,
+    // e.g. (FOO - :lower16:BAR) isn't legal.
+    ARMMCExpr::VariantKind RefKind;
+    if (ParsePrefix(RefKind))
+      return true;
+
+    const MCExpr *SubExprVal;
+    if (getParser().ParseExpression(SubExprVal))
+      return true;
+
+    const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
+                                                   getContext());
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
+    return false;
+  }
   }
 }
 
-/// Parse an arm instruction mnemonic followed by its operands.
-bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  OwningPtr<ARMOperand> Op;
+// ParsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+//  :lower16: and :upper16:.
+bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  RefKind = ARMMCExpr::VK_ARM_None;
 
-  // Create the leading tokens for the mnemonic, split by '.' characters.
-  size_t Start = 0, Next = Name.find('.');
-  StringRef Head = Name.slice(Start, Next);
+  // :lower16: and :upper16: modifiers
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+  if (IDVal == "lower16") {
+    RefKind = ARMMCExpr::VK_ARM_LO16;
+  } else if (IDVal == "upper16") {
+    RefKind = ARMMCExpr::VK_ARM_HI16;
+  } else {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+  return false;
+}
+
+const MCExpr *
+ARMAsmParser::ApplyPrefixToExpr(const MCExpr *E,
+                                MCSymbolRefExpr::VariantKind Variant) {
+  // Recurse over the given expression, rebuilding it to apply the given variant
+  // to the leftmost symbol.
+  if (Variant == MCSymbolRefExpr::VK_None)
+    return E;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle target expr yet");
+  case MCExpr::Constant:
+    llvm_unreachable("Can't handle lower16/upper16 of constant yet");
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
 
-  // Determine the predicate, if any.
+    if (SRE->getKind() != MCSymbolRefExpr::VK_None)
+      return 0;
+
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, getContext());
+  }
+
+  case MCExpr::Unary:
+    llvm_unreachable("Can't handle unary expressions yet");
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = ApplyPrefixToExpr(BE->getLHS(), Variant);
+    const MCExpr *RHS = BE->getRHS();
+    if (!LHS)
+      return 0;
+
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
+  }
+  }
+
+  assert(0 && "Invalid expression kind!");
+  return 0;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags.
+//
+// FIXME: Would be nice to autogen this.
+static StringRef SplitMnemonic(StringRef Mnemonic,
+                               unsigned &PredicationCode,
+                               bool &CarrySetting,
+                               unsigned &ProcessorIMod) {
+  PredicationCode = ARMCC::AL;
+  CarrySetting = false;
+  ProcessorIMod = 0;
+
+  // Ignore some mnemonics we know aren't predicated forms.
   //
-  // FIXME: We need a way to check whether a prefix supports predication,
-  // otherwise we will end up with an ambiguity for instructions that happen to
-  // end with a predicate name.
-  unsigned CC = StringSwitch<unsigned>(Head.substr(Head.size()-2))
+  // FIXME: Would be nice to autogen this.
+  if (Mnemonic == "teq" || Mnemonic == "vceq" ||
+      Mnemonic == "movs" ||
+      Mnemonic == "svc" ||
+      (Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
+       Mnemonic == "vmls" || Mnemonic == "vnmls") ||
+      Mnemonic == "vacge" || Mnemonic == "vcge" ||
+      Mnemonic == "vclt" ||
+      Mnemonic == "vacgt" || Mnemonic == "vcgt" ||
+      Mnemonic == "vcle" ||
+      (Mnemonic == "smlal" || Mnemonic == "umaal" || Mnemonic == "umlal" ||
+       Mnemonic == "vabal" || Mnemonic == "vmlal" || Mnemonic == "vpadal" ||
+       Mnemonic == "vqdmlal"))
+    return Mnemonic;
+
+  // First, split out any predication code.
+  unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
     .Case("eq", ARMCC::EQ)
     .Case("ne", ARMCC::NE)
     .Case("hs", ARMCC::HS)
@@ -704,44 +1459,268 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
     .Case("al", ARMCC::AL)
     .Default(~0U);
   if (CC != ~0U) {
-    Head = Head.slice(0, Head.size() - 2);
-  } else
-    CC = ARMCC::AL;
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+    PredicationCode = CC;
+  }
+
+  // Next, determine if we have a carry setting bit. We explicitly ignore all
+  // the instructions we know end in 's'.
+  if (Mnemonic.endswith("s") &&
+      !(Mnemonic == "asrs" || Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "movs" || Mnemonic == "mrs" || Mnemonic == "smmls" ||
+        Mnemonic == "vabs" || Mnemonic == "vcls" || Mnemonic == "vmls" ||
+        Mnemonic == "vmrs" || Mnemonic == "vnmls" || Mnemonic == "vqabs" ||
+        Mnemonic == "vrecps" || Mnemonic == "vrsqrts")) {
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+    CarrySetting = true;
+  }
+
+  // The "cps" instruction can have a interrupt mode operand which is glued into
+  // the mnemonic. Check if this is the case, split it and parse the imod op
+  if (Mnemonic.startswith("cps")) {
+    // Split out any imod code.
+    unsigned IMod =
+      StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+      .Case("ie", ARM_PROC::IE)
+      .Case("id", ARM_PROC::ID)
+      .Default(~0U);
+    if (IMod != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+      ProcessorIMod = IMod;
+    }
+  }
+
+  return Mnemonic;
+}
+
+/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// inclusion of carry set or predication code operands.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::
+GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+                      bool &CanAcceptPredicationCode) {
+  bool isThumb = TM.getSubtarget<ARMSubtarget>().isThumb();
+
+  if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" ||
+      Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
+      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mov" ||
+      Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
+      Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" ||
+      Mnemonic == "eor" || Mnemonic == "smlal" || Mnemonic == "mvn") {
+    CanAcceptCarrySet = true;
+  } else {
+    CanAcceptCarrySet = false;
+  }
+
+  if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
+      Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
+      Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
+      Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
+      Mnemonic == "dsb" || Mnemonic == "movs" || Mnemonic == "isb" ||
+      Mnemonic == "clrex" || Mnemonic.startswith("cps")) {
+    CanAcceptPredicationCode = false;
+  } else {
+    CanAcceptPredicationCode = true;
+  }
 
-  ARMOperand::CreateToken(Op, Head, NameLoc);
-  Operands.push_back(Op.take());
+  if (isThumb)
+    if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
+        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
+      CanAcceptPredicationCode = false;
+}
+
+/// Parse an arm instruction mnemonic followed by its operands.
+bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  Head = SplitMnemonic(Head, PredicationCode, CarrySetting,
+                       ProcessorIMod);
+
+  Operands.push_back(ARMOperand::CreateToken(Head, NameLoc));
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or predication
+  // code, our matching model involves us always generating CCOut and
+  // ConditionCode operands to match the mnemonic "as written" and then we let
+  // the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  GetMnemonicAcceptInfo(Head, CanAcceptCarrySet, CanAcceptPredicationCode);
+
+  // Add the carry setting operand, if necessary.
+  //
+  // FIXME: It would be awesome if we could somehow invent a location such that
+  // match errors on this operand would print a nice diagnostic about how the
+  // 's' character in the mnemonic resulted in a CCOut operand.
+  if (CanAcceptCarrySet) {
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               NameLoc));
+  } else {
+    // This mnemonic can't ever accept a carry set, but the user wrote one (or
+    // misspelled another mnemonic).
+
+    // FIXME: Issue a nice error.
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    Operands.push_back(ARMOperand::CreateCondCode(
+                         ARMCC::CondCodes(PredicationCode), NameLoc));
+  } else {
+    // This mnemonic can't ever accept a predication code, but the user wrote
+    // one (or misspelled another mnemonic).
+
+    // FIXME: Issue a nice error.
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::Create(ProcessorIMod, getContext()),
+                                 NameLoc, NameLoc));
+  } else {
+    // This mnemonic can't ever accept a imod, but the user wrote
+    // one (or misspelled another mnemonic).
 
-  ARMOperand::CreateCondCode(Op, ARMCC::CondCodes(CC), NameLoc);
-  Operands.push_back(Op.take());
+    // FIXME: Issue a nice error.
+  }
 
   // Add the remaining tokens in the mnemonic.
   while (Next != StringRef::npos) {
     Start = Next;
     Next = Name.find('.', Start + 1);
-    Head = Name.slice(Start, Next);
+    StringRef ExtraToken = Name.slice(Start, Next);
 
-    ARMOperand::CreateToken(Op, Head, NameLoc);
-    Operands.push_back(Op.take());
+    Operands.push_back(ARMOperand::CreateToken(ExtraToken, NameLoc));
   }
 
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
-    OwningPtr<ARMOperand> Op;
-    if (ParseOperand(Op)) return true;
-    Operands.push_back(Op.take());
+    if (ParseOperand(Operands, Head)) {
+      Parser.EatToEndOfStatement();
+      return true;
+    }
 
     while (getLexer().is(AsmToken::Comma)) {
       Parser.Lex();  // Eat the comma.
 
       // Parse and remember the operand.
-      if (ParseOperand(Op)) return true;
-      Operands.push_back(Op.take());
+      if (ParseOperand(Operands, Head)) {
+        Parser.EatToEndOfStatement();
+        return true;
+      }
     }
   }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Parser.EatToEndOfStatement();
+    return TokError("unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
   return false;
 }
 
+bool ARMAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCStreamer &Out) {
+  MCInst Inst;
+  unsigned ErrorInfo;
+  MatchResultTy MatchResult, MatchResult2;
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+  if (MatchResult != Match_Success) {
+    // If we get a Match_InvalidOperand it might be some arithmetic instruction
+    // that does not update the condition codes.  So try adding a CCOut operand
+    // with a value of reg0.
+    if (MatchResult == Match_InvalidOperand) {
+      Operands.insert(Operands.begin() + 1,
+                      ARMOperand::CreateCCOut(0,
+                                  ((ARMOperand*)Operands[0])->getStartLoc()));
+      MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+      if (MatchResult2 == Match_Success)
+        MatchResult = Match_Success;
+      else {
+        ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
+        Operands.erase(Operands.begin() + 1);
+        delete CCOut;
+      }
+    }
+    // If we get a Match_MnemonicFail it might be some arithmetic instruction
+    // that updates the condition codes if it ends in 's'.  So see if the
+    // mnemonic ends in 's' and if so try removing the 's' and adding a CCOut
+    // operand with a value of CPSR.
+    else if(MatchResult == Match_MnemonicFail) {
+      // Get the instruction mnemonic, which is the first token.
+      StringRef Mnemonic = ((ARMOperand*)Operands[0])->getToken();
+      if (Mnemonic.substr(Mnemonic.size()-1) == "s") {
+        // removed the 's' from the mnemonic for matching.
+        StringRef MnemonicNoS = Mnemonic.slice(0, Mnemonic.size() - 1);
+        SMLoc NameLoc = ((ARMOperand*)Operands[0])->getStartLoc();
+        ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
+        Operands.erase(Operands.begin());
+        delete OldMnemonic;
+        Operands.insert(Operands.begin(),
+                        ARMOperand::CreateToken(MnemonicNoS, NameLoc));
+        Operands.insert(Operands.begin() + 1,
+                        ARMOperand::CreateCCOut(ARM::CPSR, NameLoc));
+        MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+        if (MatchResult2 == Match_Success)
+          MatchResult = Match_Success;
+        else {
+          ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
+          Operands.erase(Operands.begin());
+          delete OldMnemonic;
+          Operands.insert(Operands.begin(),
+                          ARMOperand::CreateToken(Mnemonic, NameLoc));
+          ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
+          Operands.erase(Operands.begin() + 1);
+          delete CCOut;
+        }
+      }
+    }
+  }
+  switch (MatchResult) {
+  case Match_Success:
+    Out.EmitInstruction(Inst);
+    return false;
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_ConversionFail:
+    return Error(IDLoc, "unable to convert operands to instruction");
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+  return true;
+}
+
 /// ParseDirective parses the arm specific directives
 bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
@@ -771,7 +1750,7 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
-      
+
       // FIXME: Improve diagnostic.
       if (getLexer().isNot(AsmToken::Comma))
         return Error(L, "unexpected token in directive");
@@ -801,16 +1780,16 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
 bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
-    return Error(L, "unexpected token in .syntax directive");
-  StringRef ATTRIBUTE_UNUSED SymbolName = Parser.getTok().getIdentifier();
+    return Error(L, "unexpected token in .thumb_func directive");
+  StringRef Name = Tok.getString();
   Parser.Lex(); // Consume the identifier token.
-
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
   Parser.Lex();
 
-  // TODO: mark symbol as a thumb symbol
-  // getParser().getStreamer().Emit???();
+  // Mark symbol as a thumb symbol.
+  MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
+  getParser().getStreamer().EmitThumbFunc(Func);
   return false;
 }
 
@@ -824,7 +1803,7 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
   if (Mode == "unified" || Mode == "UNIFIED")
     Parser.Lex();
   else if (Mode == "divided" || Mode == "DIVIDED")
-    Parser.Lex();
+    return Error(L, "'.syntax divided' arm asssembly not supported");
   else
     return Error(L, "unrecognized syntax mode in .syntax directive");
 
@@ -855,8 +1834,21 @@ bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
     return Error(Parser.getTok().getLoc(), "unexpected token in directive");
   Parser.Lex();
 
-  // TODO tell the MC streamer the mode
-  // getParser().getStreamer().Emit???();
+  // FIXME: We need to be able switch subtargets at this point so that
+  // MatchInstructionImpl() will work when it gets the AvailableFeatures which
+  // includes Feature_IsThumb or not to match the right instructions.  This is
+  // blocked on the FIXME in llvm-mc.cpp when creating the TargetMachine.
+  if (Val == 16){
+    assert(TM.getSubtarget<ARMSubtarget>().isThumb() &&
+	   "switching between arm/thumb not yet suppported via .code 16)");
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  }
+  else{
+    assert(!TM.getSubtarget<ARMSubtarget>().isThumb() &&
+           "switching between thumb/arm not yet suppported via .code 32)");
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+   }
+
   return false;
 }
 
@@ -869,4 +1861,6 @@ extern "C" void LLVMInitializeARMAsmParser() {
   LLVMInitializeARMAsmLexer();
 }
 
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
 #include "ARMGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index e220289..78d73d3 100644
--- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -39,9 +39,9 @@
 /// o static uint16_t decodeThumbInstruction(field_t insn) - the decoding
 /// function for a Thumb instruction.
 ///
-#include "../ARMGenDecoderTables.inc"
+#include "ARMGenDecoderTables.inc"
 
-#include "../ARMGenEDInfo.inc"
+#include "ARMGenEDInfo.inc"
 
 using namespace llvm;
 
@@ -89,7 +89,8 @@ static unsigned decodeARMInstruction(uint32_t &insn) {
       return ARM::BFI;
   }
 
-  // Ditto for STRBT, which is a super-instruction for A8.6.199 Encoding A1 & A2.
+  // Ditto for STRBT, which is a super-instruction for A8.6.199 Encodings
+  // A1 & A2.
   // As a result, the decoder fails to deocode USAT properly.
   if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1)
     return ARM::USAT;
@@ -252,9 +253,6 @@ static unsigned T2Morph2LoadLiteral(unsigned Opcode) {
   default:
     return Opcode; // Return unmorphed opcode.
 
-  case ARM::t2LDRDi8:
-    return ARM::t2LDRDpci;
-
   case ARM::t2LDR_POST:   case ARM::t2LDR_PRE:
   case ARM::t2LDRi12:     case ARM::t2LDRi8:
   case ARM::t2LDRs:       case ARM::t2LDRT:
@@ -349,36 +347,6 @@ static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) {
   return decodeThumbInstruction(insn);
 }
 
-static inline bool Thumb2PreloadOpcodeNoPCI(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::t2PLDi12:   case ARM::t2PLDi8:
-  case ARM::t2PLDr:     case ARM::t2PLDs:
-  case ARM::t2PLDWi12:  case ARM::t2PLDWi8:
-  case ARM::t2PLDWr:    case ARM::t2PLDWs:
-  case ARM::t2PLIi12:   case ARM::t2PLIi8:
-  case ARM::t2PLIr:     case ARM::t2PLIs:
-    return true;
-  }
-}
-
-static inline unsigned T2Morph2Preload2PCI(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return 0;
-  case ARM::t2PLDi12:   case ARM::t2PLDi8:
-  case ARM::t2PLDr:     case ARM::t2PLDs:
-    return ARM::t2PLDpci;
-  case ARM::t2PLDWi12:  case ARM::t2PLDWi8:
-  case ARM::t2PLDWr:    case ARM::t2PLDWs:
-    return ARM::t2PLDWpci;
-  case ARM::t2PLIi12:   case ARM::t2PLIi8:
-  case ARM::t2PLIr:     case ARM::t2PLIs:
-    return ARM::t2PLIpci;
-  }
-}
-
 //
 // Public interface for the disassembler
 //
@@ -485,11 +453,6 @@ bool ThumbDisassembler::getInstruction(MCInst &MI,
   // instructions as well.
   unsigned Opcode = decodeThumbSideEffect(IsThumb2, insn);
 
-  // A8.6.117/119/120/121.
-  // PLD/PLDW/PLI instructions with Rn==15 is transformed to the pci variant.
-  if (Thumb2PreloadOpcodeNoPCI(Opcode) && slice(insn, 19, 16) == 15)
-    Opcode = T2Morph2Preload2PCI(Opcode);
-
   ARMFormat Format = ARMFormats[Opcode];
   Size = IsThumb2 ? 4 : 2;
 
@@ -568,9 +531,9 @@ static MCDisassembler *createThumbDisassembler(const Target &T) {
   return new ThumbDisassembler;
 }
 
-extern "C" void LLVMInitializeARMDisassembler() { 
+extern "C" void LLVMInitializeARMDisassembler() {
   // Register the disassembler.
-  TargetRegistry::RegisterMCDisassembler(TheARMTarget, 
+  TargetRegistry::RegisterMCDisassembler(TheARMTarget,
                                          createARMDisassembler);
   TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
                                          createThumbDisassembler);
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
index 9f493b9..bac68dd 100644
--- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
@@ -79,22 +79,9 @@ const char *ARMUtils::OpcodeName(unsigned Opcode) {
 }
 
 // Return the register enum Based on RegClass and the raw register number.
-// For DRegPair, see comments below.
 // FIXME: Auto-gened?
-static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister,
-                                bool DRegPair = false) {
-
-  if (DRegPair && RegClassID == ARM::QPRRegClassID) {
-    // LLVM expects { Dd, Dd+1 } to form a super register; this is not specified
-    // in the ARM Architecture Manual as far as I understand it (A8.6.307).
-    // Therefore, we morph the RegClassID to be the sub register class and don't
-    // subsequently transform the RawRegister encoding when calculating RegNum.
-    //
-    // See also ARMinstPrinter::printOperand() wrt "dregpair" modifier part
-    // where this workaround is meant for.
-    RegClassID = ARM::DPRRegClassID;
-  }
-
+static unsigned
+getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) {
   // For this purpose, we can treat rGPR as if it were GPR.
   if (RegClassID == ARM::rGPRRegClassID) RegClassID = ARM::GPRRegClassID;
 
@@ -704,8 +691,8 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn,
 // MSR/MSRsys: Rm mask=Inst{19-16}
 // BXJ:        Rm
 // MSRi/MSRsysi: so_imm
-// SRSW/SRS: addrmode4:$addr mode_imm
-// RFEW/RFE: addrmode4:$addr Rn
+// SRSW/SRS: ldstm_mode:$amode mode_imm
+// RFEW/RFE: ldstm_mode:$amode Rn
 static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -733,35 +720,34 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     NumOpsAdded = 1;
     return true;
   }
-  // MSR and MSRsys take one GPR reg Rm, followed by the mask.
-  if (Opcode == ARM::MSR || Opcode == ARM::MSRsys) {
-    assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID &&
+  // MSR take a mask, followed by one GPR reg Rm. The mask contains the R Bit in
+  // bit 4, and the special register fields in bits 3-0.
+  if (Opcode == ARM::MSR) {
+    assert(NumOps >= 1 && OpInfo[1].RegClass == ARM::GPRRegClassID &&
            "Reg operand expected");
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
+                                       slice(insn, 19, 16) /* Special Reg */ ));
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                        decodeRm(insn))));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16)));
     NumOpsAdded = 2;
     return true;
   }
-  // MSRi and MSRsysi take one so_imm operand, followed by the mask.
-  if (Opcode == ARM::MSRi || Opcode == ARM::MSRsysi) {
+  // MSRi take a mask, followed by one so_imm operand. The mask contains the
+  // R Bit in bit 4, and the special register fields in bits 3-0.
+  if (Opcode == ARM::MSRi) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
+                                       slice(insn, 19, 16) /* Special Reg */ ));
     // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0.
     // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}.
     // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot().
     unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF;
     unsigned Imm = insn & 0xFF;
     MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot)));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16)));
     NumOpsAdded = 2;
     return true;
   }
-  // SRSW and SRS requires addrmode4:$addr for ${addr:submode}, followed by the
-  // mode immediate (Inst{4-0}).
   if (Opcode == ARM::SRSW || Opcode == ARM::SRS ||
       Opcode == ARM::RFEW || Opcode == ARM::RFE) {
-    // ARMInstPrinter::printAddrMode4Operand() prints special mode string
-    // if the base register is SP; so don't set ARM::SP.
-    MI.addOperand(MCOperand::CreateReg(0));
     ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn));
     MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode)));
 
@@ -807,9 +793,8 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 }
 
 // Misc. Branch Instructions.
-// BR_JTadd, BR_JTr, BR_JTm
 // BLXr9, BXr9
-// BRIND, BX_RET
+// BX, BX_RET
 static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -820,12 +805,12 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   OpIdx = 0;
 
-  // BX_RET has only two predicate operands, do an early return.
-  if (Opcode == ARM::BX_RET)
+  // BX_RET and MOVPCLR have only two predicate operands; do an early return.
+  if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR)
     return true;
 
-  // BLXr9 and BRIND take one GPR reg.
-  if (Opcode == ARM::BLXr9 || Opcode == ARM::BRIND) {
+  // BLXr9 and BX take one GPR reg.
+  if (Opcode == ARM::BLXr9 || Opcode == ARM::BX) {
     assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
            "Reg operand expected");
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
@@ -834,72 +819,6 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     return true;
   }
 
-  // BR_JTadd is an ADD with Rd = PC, (Rn, Rm) as the target and index regs.
-  if (Opcode == ARM::BR_JTadd) {
-    // InOperandList with GPR:$target and GPR:$idx regs.
-
-    assert(NumOps == 4 && "Expect 4 operands");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-
-    // Fill in the two remaining imm operands to signify build completion.
-    MI.addOperand(MCOperand::CreateImm(0));
-    MI.addOperand(MCOperand::CreateImm(0));
-
-    OpIdx = 4;
-    return true;
-  }
-
-  // BR_JTr is a MOV with Rd = PC, and Rm as the source register.
-  if (Opcode == ARM::BR_JTr) {
-    // InOperandList with GPR::$target reg.
-
-    assert(NumOps == 3 && "Expect 3 operands");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-
-    // Fill in the two remaining imm operands to signify build completion.
-    MI.addOperand(MCOperand::CreateImm(0));
-    MI.addOperand(MCOperand::CreateImm(0));
-
-    OpIdx = 3;
-    return true;
-  }
-
-  // BR_JTm is an LDR with Rt = PC.
-  if (Opcode == ARM::BR_JTm) {
-    // This is the reg/reg form, with base reg followed by +/- reg shop imm.
-    // See also ARMAddressingModes.h (Addressing Mode #2).
-
-    assert(NumOps == 5 && getIBit(insn) == 1 && "Expect 5 operands && I-bit=1");
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRn(insn))));
-
-    ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-
-    // Disassemble the offset reg (Rm), shift type, and immediate shift length.
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                       decodeRm(insn))));
-    // Inst{6-5} encodes the shift opcode.
-    ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
-    // Inst{11-7} encodes the imm5 shift amount.
-    unsigned ShImm = slice(insn, 11, 7);
-
-    // A8.4.1.  Possible rrx or shift amount of 32...
-    getImmShiftSE(ShOp, ShImm);
-    MI.addOperand(MCOperand::CreateImm(
-                    ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp)));
-
-    // Fill in the two remaining imm operands to signify build completion.
-    MI.addOperand(MCOperand::CreateImm(0));
-    MI.addOperand(MCOperand::CreateImm(0));
-
-    OpIdx = 5;
-    return true;
-  }
-
   return false;
 }
 
@@ -1324,30 +1243,28 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
   assert(NumOps >= 5 && "LdStMulFrm expects NumOps >= 5");
-
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
+  NumOpsAdded = 0;
 
   unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
 
   // Writeback to base, if necessary.
-  if (Opcode == ARM::LDM_UPD || Opcode == ARM::STM_UPD) {
+  if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::STMIA_UPD ||
+      Opcode == ARM::LDMDA_UPD || Opcode == ARM::STMDA_UPD ||
+      Opcode == ARM::LDMDB_UPD || Opcode == ARM::STMDB_UPD ||
+      Opcode == ARM::LDMIB_UPD || Opcode == ARM::STMIB_UPD) {
     MI.addOperand(MCOperand::CreateReg(Base));
-    ++OpIdx;
+    ++NumOpsAdded;
   }
 
+  // Add the base register operand.
   MI.addOperand(MCOperand::CreateReg(Base));
 
-  ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn));
-  MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode)));
-
   // Handling the two predicate operands before the reglist.
   int64_t CondVal = insn >> ARMII::CondShift;
   MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 0xE : CondVal));
   MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
 
-  OpIdx += 4;
+  NumOpsAdded += 3;
 
   // Fill the variadic part of reglist.
   unsigned RegListBits = insn & ((1 << 16) - 1);
@@ -1355,7 +1272,7 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     if ((RegListBits >> i) & 1) {
       MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                          i)));
-      ++OpIdx;
+      ++NumOpsAdded;
     }
   }
 
@@ -1586,8 +1503,7 @@ static unsigned decodeVFPRm(uint32_t insn, bool isSPVFP) {
 }
 
 // A7.5.1
-#if 0
-static uint64_t VFPExpandImm(unsigned char byte, unsigned N) {
+static APInt VFPExpandImm(unsigned char byte, unsigned N) {
   assert(N == 32 || N == 64);
 
   uint64_t Result;
@@ -1602,13 +1518,12 @@ static uint64_t VFPExpandImm(unsigned char byte, unsigned N) {
     Result = (uint64_t)slice(byte, 7, 7) << 63 |
              (uint64_t)slice(byte, 5, 0) << 48;
     if (bit6)
-      Result |= 0xffL << 54;
+      Result |= 0xffULL << 54;
     else
-      Result |= 0x1L << 62;
+      Result |= 0x1ULL << 62;
   }
-  return Result;
+  return APInt(N, Result);
 }
-#endif
 
 // VFP Unary Format Instructions:
 //
@@ -1902,8 +1817,10 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
 
   // Writeback to base, if necessary.
-  if (Opcode == ARM::VLDMD_UPD || Opcode == ARM::VLDMS_UPD ||
-      Opcode == ARM::VSTMD_UPD || Opcode == ARM::VSTMS_UPD) {
+  if (Opcode == ARM::VLDMDIA_UPD || Opcode == ARM::VLDMSIA_UPD ||
+      Opcode == ARM::VLDMDDB_UPD || Opcode == ARM::VLDMSDB_UPD ||
+      Opcode == ARM::VSTMDIA_UPD || Opcode == ARM::VSTMSIA_UPD ||
+      Opcode == ARM::VSTMDDB_UPD || Opcode == ARM::VSTMSDB_UPD) {
     MI.addOperand(MCOperand::CreateReg(Base));
     ++OpIdx;
   }
@@ -1926,8 +1843,10 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   OpIdx += 4;
 
-  bool isSPVFP = (Opcode == ARM::VLDMS || Opcode == ARM::VLDMS_UPD ||
-                  Opcode == ARM::VSTMS || Opcode == ARM::VSTMS_UPD);
+  bool isSPVFP = (Opcode == ARM::VLDMSIA     || Opcode == ARM::VLDMSDB     ||
+                  Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD ||
+                  Opcode == ARM::VSTMSIA     || Opcode == ARM::VSTMSDB     ||
+                  Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD);
   unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
 
   // Extract Dd/Sd.
@@ -1985,10 +1904,14 @@ static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   // Extract/decode the f64/f32 immediate.
   if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
         && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-    // The asm syntax specifies the before-expanded <imm>.
-    // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0),
-    //                  Opcode == ARM::FCONSTD ? 64 : 32)
-    MI.addOperand(MCOperand::CreateImm(slice(insn,19,16)<<4 | slice(insn,3,0)));
+    // The asm syntax specifies the floating point value, not the 8-bit literal.
+    APInt immRaw = VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0),
+                             Opcode == ARM::FCONSTD ? 64 : 32);
+    APFloat immFP = APFloat(immRaw, true);
+    double imm = Opcode == ARM::FCONSTD ? immFP.convertToDouble() :
+      immFP.convertToFloat();
+    MI.addOperand(MCOperand::CreateFPImm(imm));
+
     ++OpIdx;
   }
 
@@ -2201,22 +2124,6 @@ static unsigned decodeN3VImm(uint32_t insn) {
   return (insn >> 8) & 0xF;
 }
 
-static bool UseDRegPair(unsigned Opcode) {
-  switch (Opcode) {
-  default:
-    return false;
-  case ARM::VLD1q8_UPD:
-  case ARM::VLD1q16_UPD:
-  case ARM::VLD1q32_UPD:
-  case ARM::VLD1q64_UPD:
-  case ARM::VST1q8_UPD:
-  case ARM::VST1q16_UPD:
-  case ARM::VST1q32_UPD:
-  case ARM::VST1q64_UPD:
-    return true;
-  }
-}
-
 // VLD*
 //   D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm]
 // VLD*LN*
@@ -2243,10 +2150,9 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   // We have homogeneous NEON registers for Load/Store.
   unsigned RegClass = 0;
-  bool DRegPair = UseDRegPair(Opcode);
 
   // Double-spaced registers have increments of 2.
-  unsigned Inc = (DblSpaced || DRegPair) ? 2 : 1;
+  unsigned Inc = DblSpaced ? 2 : 1;
 
   unsigned Rn = decodeRn(insn);
   unsigned Rm = decodeRm(insn);
@@ -2292,7 +2198,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn,
     RegClass = OpInfo[OpIdx].RegClass;
     while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
       MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, RegClass, Rd, DRegPair)));
+                      getRegisterEnum(B, RegClass, Rd)));
       Rd += Inc;
       ++OpIdx;
     }
@@ -2311,7 +2217,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn,
 
     while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
       MI.addOperand(MCOperand::CreateReg(
-                      getRegisterEnum(B, RegClass, Rd, DRegPair)));
+                      getRegisterEnum(B, RegClass, Rd)));
       Rd += Inc;
       ++OpIdx;
     }
@@ -2771,8 +2677,8 @@ static bool DisassembleN3RegVecShFrm(MCInst &MI, unsigned Opcode,
   return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
                                   N3V_VectorShift, B);
 }
-static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
-    unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
+static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode,
+    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
   return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
                                   N3V_VectorExtract, B);
@@ -2959,9 +2865,9 @@ static inline bool MemBarrierInstr(uint32_t insn) {
 
 static inline bool PreLoadOpcode(unsigned Opcode) {
   switch(Opcode) {
-  case ARM::PLDi:  case ARM::PLDr:
-  case ARM::PLDWi: case ARM::PLDWr:
-  case ARM::PLIi:  case ARM::PLIr:
+  case ARM::PLDi12:  case ARM::PLDrs:
+  case ARM::PLDWi12: case ARM::PLDWrs:
+  case ARM::PLIi12:  case ARM::PLIrs:
     return true;
   default:
     return false;
@@ -2971,18 +2877,21 @@ static inline bool PreLoadOpcode(unsigned Opcode) {
 static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
-  // Preload Data/Instruction requires either 2 or 4 operands.
-  // PLDi, PLDWi, PLIi:                Rn [+/-]imm12 add = (U == '1')
-  // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: Rn Rm addrmode2_opc
+  // Preload Data/Instruction requires either 2 or 3 operands.
+  // PLDi, PLDWi, PLIi:                addrmode_imm12
+  // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: ldst_so_reg
 
   MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                      decodeRn(insn))));
 
-  if (Opcode == ARM::PLDi || Opcode == ARM::PLDWi || Opcode == ARM::PLIi) {
+  if (Opcode == ARM::PLDi12 || Opcode == ARM::PLDWi12
+      || Opcode == ARM::PLIi12) {
     unsigned Imm12 = slice(insn, 11, 0);
     bool Negative = getUBit(insn) == 0;
-    int Offset = Negative ? -1 - Imm12 : 1 * Imm12;
-    MI.addOperand(MCOperand::CreateImm(Offset));
+    // -0 is represented specially. All other values are as normal.
+    if (Imm12 == 0 && Negative)
+      Imm12 = INT32_MIN;
+    MI.addOperand(MCOperand::CreateImm(Imm12));
     NumOpsAdded = 2;
   } else {
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
@@ -3026,22 +2935,36 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   case ARM::WFE:
   case ARM::WFI:
   case ARM::SEV:
-  case ARM::SETENDBE:
-  case ARM::SETENDLE:
     return true;
   default:
     break;
   }
 
-  // CPS has a singleton $opt operand that contains the following information:
-  // opt{4-0} = mode from Inst{4-0}
-  // opt{5} = changemode from Inst{17}
-  // opt{8-6} = AIF from Inst{8-6}
-  // opt{10-9} = imod from Inst{19-18} with 0b10 as enable and 0b11 as disable
-  if (Opcode == ARM::CPS) {
-    unsigned Option = slice(insn, 4, 0) | slice(insn, 17, 17) << 5 |
-      slice(insn, 8, 6) << 6 | slice(insn, 19, 18) << 9;
-    MI.addOperand(MCOperand::CreateImm(Option));
+  if (Opcode == ARM::SETEND) {
+    NumOpsAdded = 1;
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 9, 9)));
+    return true;
+  }
+
+  // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
+  // opcodes which match the same real instruction. This is needed since there's
+  // no current handling of optional arguments. Fix here when a better handling
+  // of optional arguments is implemented.
+  if (Opcode == ARM::CPS3p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6)));   // iflags
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));   // mode
+    NumOpsAdded = 3;
+    return true;
+  }
+  if (Opcode == ARM::CPS2p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6)));   // iflags
+    NumOpsAdded = 2;
+    return true;
+  }
+  if (Opcode == ARM::CPS1p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
     NumOpsAdded = 1;
     return true;
   }
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
index 112817b..23372e0 100644
--- a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
@@ -564,6 +564,38 @@ static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn,
 // t_addrmode_sp := sp + imm8 * 4
 //
 
+// A8.6.63 LDRB (literal)
+// A8.6.79 LDRSB (literal)
+// A8.6.75 LDRH (literal)
+// A8.6.83 LDRSH (literal)
+// A8.6.59 LDR (literal)
+//
+// These instrs calculate an address from the PC value and an immediate offset.
+// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1)
+static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode,
+    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
+
+  const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
+  if (!OpInfo) return false;
+
+  assert(NumOps >= 2 &&
+         OpInfo[0].RegClass == ARM::GPRRegClassID &&
+         OpInfo[1].RegClass < 0 &&
+         "Expect >= 2 operands, first as reg, and second as imm operand");
+
+  // Build the register operand, followed by the (+/-)imm12 immediate.
+
+  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+                                                     decodeRd(insn))));
+
+  MI.addOperand(MCOperand::CreateImm(decodeImm12(insn)));
+
+  NumOpsAdded = 2;
+
+  return true;
+}
+
+
 // A6.2.4 Load/store single data item
 //
 // Load/Store Register (reg|imm):      tRd tRn imm5 tRm
@@ -796,14 +828,13 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn,
   }
 
   // CPS has a singleton $opt operand that contains the following information:
-  // opt{4-0} = don't care
-  // opt{5} = 0 (false)
-  // opt{8-6} = AIF from Inst{2-0}
-  // opt{10-9} = 1:imod from Inst{4} with 0b10 as enable and 0b11 as disable
+  // The first op would be 0b10 as enable and 0b11 as disable in regular ARM,
+  // but in Thumb it's is 0 as enable and 1 as disable. So map it to ARM's
+  // default one. The second get the AIF flags from Inst{2-0}.
   if (Opcode == ARM::tCPS) {
-    unsigned Option = slice(insn, 2, 0) << 6 | slice(insn, 4, 4) << 9 | 1 << 10;
-    MI.addOperand(MCOperand::CreateImm(Option));
-    NumOpsAdded = 1;
+    MI.addOperand(MCOperand::CreateImm(2 + slice(insn, 4, 4)));
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 2, 0)));
+    NumOpsAdded = 2;
     return true;
   }
 
@@ -833,40 +864,32 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn,
 // A8.6.53  LDM / LDMIA
 // A8.6.189 STM / STMIA
 //
-// tLDM_UPD/tSTM_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list
-// tLDM:              tRt AM4ModeImm Pred-Imm Pred-CCR register_list
+// tLDMIA_UPD/tSTMIA_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list
+// tLDMIA:                tRt AM4ModeImm Pred-Imm Pred-CCR register_list
 static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  assert((Opcode == ARM::tLDM || Opcode == ARM::tLDM_UPD ||
-          Opcode == ARM::tSTM_UPD) && "Unexpected opcode");
-
-  unsigned &OpIdx = NumOpsAdded;
+                                     uint32_t insn, unsigned short NumOps,
+                                     unsigned &NumOpsAdded, BO B) {
+  assert((Opcode == ARM::tLDMIA || Opcode == ARM::tLDMIA_UPD ||
+          Opcode == ARM::tSTMIA_UPD) && "Unexpected opcode");
 
   unsigned tRt = getT1tRt(insn);
-
-  OpIdx = 0;
+  NumOpsAdded = 0;
 
   // WB register, if necessary.
-  if (Opcode == ARM::tLDM_UPD || Opcode == ARM::tSTM_UPD) {
+  if (Opcode == ARM::tLDMIA_UPD || Opcode == ARM::tSTMIA_UPD) {
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                        tRt)));
-    ++OpIdx;
+    ++NumOpsAdded;
   }
 
   MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                      tRt)));
-  ++OpIdx;
-
-  // A8.6.53 LDM / LDMIA / LDMFD - Encoding T1
-  // A8.6.53 STM / STMIA / STMEA - Encoding T1
-  MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)));
-  ++OpIdx;
+  ++NumOpsAdded;
 
   // Handling the two predicate operands before the reglist.
-  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-    OpIdx += 2;
-  else {
+  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
+    NumOpsAdded += 2;
+  } else {
     DEBUG(errs() << "Expected predicate operands not found.\n");
     return false;
   }
@@ -874,13 +897,12 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode,
   unsigned RegListBits = slice(insn, 7, 0);
 
   // Fill the variadic part of reglist.
-  for (unsigned i = 0; i < 8; ++i) {
+  for (unsigned i = 0; i < 8; ++i)
     if ((RegListBits >> i) & 1) {
       MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
                                                          i)));
-      ++OpIdx;
+      ++NumOpsAdded;
     }
-  }
 
   return true;
 }
@@ -959,22 +981,23 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn,
 // corresponding to op.
 //
 // Table A6-1 16-bit Thumb instruction encoding (abridged)
-// op		Instruction or instruction class
-// ------	--------------------------------------------------------------------
-// 00xxxx	Shift (immediate), add, subtract, move, and compare on page A6-7
-// 010000	Data-processing on page A6-8
-// 010001	Special data instructions and branch and exchange on page A6-9
-// 01001x	Load from Literal Pool, see LDR (literal) on page A8-122
-// 0101xx	Load/store single data item on page A6-10
+// op    Instruction or instruction class
+// ------  --------------------------------------------------------------------
+// 00xxxx  Shift (immediate), add, subtract, move, and compare on page A6-7
+// 010000  Data-processing on page A6-8
+// 010001  Special data instructions and branch and exchange on page A6-9
+// 01001x  Load from Literal Pool, see LDR (literal) on page A8-122
+// 0101xx  Load/store single data item on page A6-10
 // 011xxx
 // 100xxx
-// 10100x	Generate PC-relative address, see ADR on page A8-32
-// 10101x	Generate SP-relative address, see ADD (SP plus immediate) on page A8-28
-// 1011xx	Miscellaneous 16-bit instructions on page A6-11
-// 11000x	Store multiple registers, see STM / STMIA / STMEA on page A8-374
-// 11001x	Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a
-// 1101xx	Conditional branch, and Supervisor Call on page A6-13
-// 11100x	Unconditional Branch, see B on page A8-44
+// 10100x  Generate PC-relative address, see ADR on page A8-32
+// 10101x  Generate SP-relative address, see ADD (SP plus immediate) on
+//         page A8-28
+// 1011xx  Miscellaneous 16-bit instructions on page A6-11
+// 11000x  Store multiple registers, see STM / STMIA / STMEA on page A8-374
+// 11001x  Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110 a
+// 1101xx  Conditional branch, and Supervisor Call on page A6-13
+// 11100x  Unconditional Branch, see B on page A8-44
 //
 static bool DisassembleThumb1(uint16_t op, MCInst &MI, unsigned Opcode,
     uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
@@ -1121,34 +1144,31 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn,
   if (Thumb2RFEOpcode(Opcode))
     return DisassembleThumb2RFE(MI, Opcode, insn, NumOps, NumOpsAdded, B);
 
-  assert((Opcode == ARM::t2LDM || Opcode == ARM::t2LDM_UPD ||
-          Opcode == ARM::t2STM || Opcode == ARM::t2STM_UPD)
+  assert((Opcode == ARM::t2LDMIA || Opcode == ARM::t2LDMIA_UPD ||
+          Opcode == ARM::t2LDMDB || Opcode == ARM::t2LDMDB_UPD ||
+          Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD ||
+          Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD)
          && "Unexpected opcode");
   assert(NumOps >= 5 && "Thumb2 LdStMul expects NumOps >= 5");
 
-  unsigned &OpIdx = NumOpsAdded;
-
-  OpIdx = 0;
+  NumOpsAdded = 0;
 
   unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
 
   // Writeback to base.
-  if (Opcode == ARM::t2LDM_UPD || Opcode == ARM::t2STM_UPD) {
+  if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD ||
+      Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB_UPD) {
     MI.addOperand(MCOperand::CreateReg(Base));
-    ++OpIdx;
+    ++NumOpsAdded;
   }
 
   MI.addOperand(MCOperand::CreateReg(Base));
-  ++OpIdx;
-
-  ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn));
-  MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode)));
-  ++OpIdx;
+  ++NumOpsAdded;
 
   // Handling the two predicate operands before the reglist.
-  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
-    OpIdx += 2;
-  else {
+  if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
+    NumOpsAdded += 2;
+  } else {
     DEBUG(errs() << "Expected predicate operands not found.\n");
     return false;
   }
@@ -1156,13 +1176,12 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn,
   unsigned RegListBits = insn & ((1 << 16) - 1);
 
   // Fill the variadic part of reglist.
-  for (unsigned i = 0; i < 16; ++i) {
+  for (unsigned i = 0; i < 16; ++i)
     if ((RegListBits >> i) & 1) {
       MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                          i)));
-      ++OpIdx;
+      ++NumOpsAdded;
     }
-  }
 
   return true;
 }
@@ -1260,13 +1279,7 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
   return true;
 }
 
-// PC-based defined for Codegen, which do not get decoded by design:
-//
-// t2TBB, t2TBH: Rm immDontCare immDontCare
-//
-// Generic version defined for disassembly:
-//
-// t2TBBgen, t2TBHgen: Rn Rm Pred-Imm Pred-CCR
+// t2TBB, t2TBH: Rn Rm Pred-Imm Pred-CCR
 static bool DisassembleThumb2TB(MCInst &MI, unsigned Opcode,
     uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -1401,7 +1414,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
 //
 // Two register operands: Rs Rn ModImm
 // One register operands (Rs=0b1111 no explicit dest reg): Rn ModImm
-// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm - {t2MOVi, t2MVNi}
+// One register operands (Rn=0b1111 no explicit src reg): Rs ModImm -
+// {t2MOVi, t2MVNi}
 //
 // ModImm = ThumbExpandImm(i:imm3:imm8)
 static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode,
@@ -1644,15 +1658,25 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
     break;
   }
 
-  // CPS has a singleton $opt operand that contains the following information:
-  // opt{4-0} = mode from Inst{4-0}
-  // opt{5} = changemode from Inst{8}
-  // opt{8-6} = AIF from Inst{7-5}
-  // opt{10-9} = imod from Inst{10-9} with 0b10 as enable and 0b11 as disable
-  if (Opcode == ARM::t2CPS) {
-    unsigned Option = slice(insn, 4, 0) | slice(insn, 8, 8) << 5 |
-      slice(insn, 7, 5) << 6 | slice(insn, 10, 9) << 9;
-    MI.addOperand(MCOperand::CreateImm(Option));
+  // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
+  // opcodes which match the same real instruction. This is needed since there's
+  // no current handling of optional arguments. Fix here when a better handling
+  // of optional arguments is implemented.
+  if (Opcode == ARM::t2CPS3p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5)));  // iflags
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));  // mode
+    NumOpsAdded = 3;
+    return true;
+  }
+  if (Opcode == ARM::t2CPS2p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5)));  // iflags
+    NumOpsAdded = 2;
+    return true;
+  }
+  if (Opcode == ARM::t2CPS1p) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
     NumOpsAdded = 1;
     return true;
   }
@@ -1678,11 +1702,13 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
     NumOpsAdded = 1;
     return true;
   }
-  // MSR and MSRsys take one GPR reg Rn, followed by the mask.
-  if (Opcode == ARM::t2MSR || Opcode == ARM::t2MSRsys || Opcode == ARM::t2BXJ) {
+  // MSR take a mask, followed by one GPR reg Rn. The mask contains the R Bit in
+  // bit 4, and the special register fields in bits 3-0.
+  if (Opcode == ARM::t2MSR) {
+    MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 20) << 4 /* R Bit */ |
+                                       slice(insn, 11, 8) /* Special Reg */));
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                        decodeRn(insn))));
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 8)));
     NumOpsAdded = 2;
     return true;
   }
@@ -1728,12 +1754,12 @@ static inline bool Thumb2PreloadOpcode(unsigned Opcode) {
   switch (Opcode) {
   default:
     return false;
-  case ARM::t2PLDi12:   case ARM::t2PLDi8:   case ARM::t2PLDpci:
-  case ARM::t2PLDr:     case ARM::t2PLDs:
-  case ARM::t2PLDWi12:  case ARM::t2PLDWi8:  case ARM::t2PLDWpci:
-  case ARM::t2PLDWr:    case ARM::t2PLDWs:
-  case ARM::t2PLIi12:   case ARM::t2PLIi8:   case ARM::t2PLIpci:
-  case ARM::t2PLIr:     case ARM::t2PLIs:
+  case ARM::t2PLDi12:   case ARM::t2PLDi8:
+  case ARM::t2PLDs:
+  case ARM::t2PLDWi12:  case ARM::t2PLDWi8:
+  case ARM::t2PLDWs:
+  case ARM::t2PLIi12:   case ARM::t2PLIi8:
+  case ARM::t2PLIs:
     return true;
   }
 }
@@ -1769,11 +1795,10 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
            && !OpInfo[OpIdx].isOptionalDef()
            && "Pure imm operand expected");
     int Offset = 0;
-    if (Opcode == ARM::t2PLDpci || Opcode == ARM::t2PLDWpci ||
-             Opcode == ARM::t2PLIpci) {
+    if (slice(insn, 19, 16) == 0xFF) {
       bool Negative = slice(insn, 23, 23) == 0;
       unsigned Imm12 = getImm12(insn);
-      Offset = Negative ? -1 - Imm12 : 1 * Imm12;      
+      Offset = Negative ? -1 - Imm12 : 1 * Imm12;
     } else if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
                Opcode == ARM::t2PLIi8) {
       // A8.6.117 Encoding T2: add = FALSE
@@ -1795,37 +1820,6 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
   return true;
 }
 
-// A8.6.63 LDRB (literal)
-// A8.6.79 LDRSB (literal)
-// A8.6.75 LDRH (literal)
-// A8.6.83 LDRSH (literal)
-// A8.6.59 LDR (literal)
-//
-// These instrs calculate an address from the PC value and an immediate offset.
-// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1)
-static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode,
-    uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
-  const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-  if (!OpInfo) return false;
-
-  assert(NumOps >= 2 &&
-         OpInfo[0].RegClass == ARM::GPRRegClassID &&
-         OpInfo[1].RegClass < 0 &&
-         "Expect >= 2 operands, first as reg, and second as imm operand");
-
-  // Build the register operand, followed by the (+/-)imm12 immediate.
-
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRd(insn))));
-
-  MI.addOperand(MCOperand::CreateImm(decodeImm12(insn)));
-
-  NumOpsAdded = 2;
-
-  return true;
-}
-
 // A6.3.10 Store single data item
 // A6.3.9 Load byte, memory hints
 // A6.3.8 Load halfword, memory hints
@@ -1835,13 +1829,15 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode,
 //
 // t2LDRi12:   Rd Rn (+)imm12
 // t2LDRi8:    Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2LDRs:     Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg)
+// t2LDRs:     Rd Rn Rm ConstantShiftSpecifier (see also
+//             DisassembleThumb2DPSoReg)
 // t2LDR_POST: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
 // t2LDR_PRE:  Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
 //
 // t2STRi12:   Rd Rn (+)imm12
 // t2STRi8:    Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2STRs:     Rd Rn Rm ConstantShiftSpecifier (see also DisassembleThumb2DPSoReg)
+// t2STRs:     Rd Rn Rm ConstantShiftSpecifier (see also
+//             DisassembleThumb2DPSoReg)
 // t2STR_POST: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
 // t2STR_PRE:  Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
 //
@@ -1862,7 +1858,6 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
   // See, for example, A6.3.7 Load word: Table A6-18 Load word.
   if (Load && Rn == 15)
     return DisassembleThumb2Ldpci(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
   const TargetInstrDesc &TID = ARMInsts[Opcode];
   const TargetOperandInfo *OpInfo = TID.OpInfo;
   unsigned &OpIdx = NumOpsAdded;
@@ -1909,7 +1904,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
     else
       Imm = decodeImm8(insn);
   }
-  
+
   MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                      R0)));
   ++OpIdx;
@@ -2081,25 +2076,29 @@ static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn,
 // corresponding to (op1, op2, op).
 //
 // Table A6-9 32-bit Thumb instruction encoding
-// op1	op2		op	Instruction class, see
-// ---	-------	--	------------------------------------------------------------
-// 01	00xx0xx	-	Load/store multiple on page A6-23
-// 		00xx1xx	-	Load/store dual, load/store exclusive, table branch on page A6-24
-// 		01xxxxx	-	Data-processing (shifted register) on page A6-31
-// 		1xxxxxx	-	Coprocessor instructions on page A6-40
-// 10	x0xxxxx	0	Data-processing (modified immediate) on page A6-15
-// 		x1xxxxx	0	Data-processing (plain binary immediate) on page A6-19
-// 		-		1	Branches and miscellaneous control on page A6-20
-// 11	000xxx0	-	Store single data item on page A6-30
-// 		001xxx0	-	Advanced SIMD element or structure load/store instructions on page A7-27
-// 		00xx001 -	Load byte, memory hints on page A6-28
-// 		00xx011	-	Load halfword, memory hints on page A6-26
-// 		00xx101	-	Load word on page A6-25
-// 		00xx111	-	UNDEFINED
-// 		010xxxx	-	Data-processing (register) on page A6-33
-// 		0110xxx	-	Multiply, multiply accumulate, and absolute difference on page A6-38
-// 		0111xxx	-	Long multiply, long multiply accumulate, and divide on page A6-39
-// 		1xxxxxx	-	Coprocessor instructions on page A6-40
+// op1  op2    op  Instruction class, see
+// ---  -------  --  -----------------------------------------------------------
+// 01  00xx0xx  -  Load/store multiple on page A6-23
+//     00xx1xx  -  Load/store dual, load/store exclusive, table branch on
+//                 page A6-24
+//     01xxxxx  -  Data-processing (shifted register) on page A6-31
+//     1xxxxxx  -  Coprocessor instructions on page A6-40
+// 10  x0xxxxx  0  Data-processing (modified immediate) on page A6-15
+//     x1xxxxx  0  Data-processing (plain binary immediate) on page A6-19
+//         -    1  Branches and miscellaneous control on page A6-20
+// 11  000xxx0  -  Store single data item on page A6-30
+//     001xxx0  -  Advanced SIMD element or structure load/store instructions
+//                 on page A7-27
+//     00xx001  - Load byte, memory hints on page A6-28
+//     00xx011  -  Load halfword, memory hints on page A6-26
+//     00xx101  -  Load word on page A6-25
+//     00xx111  -  UNDEFINED
+//     010xxxx  -  Data-processing (register) on page A6-33
+//     0110xxx  -  Multiply, multiply accumulate, and absolute difference on
+//                 page A6-38
+//     0111xxx  -  Long multiply, long multiply accumulate, and divide on
+//                 page A6-39
+//     1xxxxxx  -  Coprocessor instructions on page A6-40
 //
 static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op,
     MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps,
@@ -2130,7 +2129,7 @@ static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op,
         return DisassembleThumb2LdStDual(MI, Opcode, insn, NumOps, NumOpsAdded,
                                          B);
       }
-      if (Opcode == ARM::t2TBBgen || Opcode == ARM::t2TBHgen) {
+      if (Opcode == ARM::t2TBB || Opcode == ARM::t2TBH) {
         // Table branch.
         return DisassembleThumb2TB(MI, Opcode, insn, NumOps, NumOpsAdded, B);
       }
@@ -2175,7 +2174,8 @@ static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op,
         }
       } else {
         // Table A6-9 32-bit Thumb instruction encoding: Load byte|halfword|word
-        return DisassembleThumb2LdSt(true, MI,Opcode,insn,NumOps,NumOpsAdded, B);
+        return DisassembleThumb2LdSt(true, MI, Opcode, insn, NumOps,
+                                     NumOpsAdded, B);
       }
       break;
     case 1:
@@ -2229,7 +2229,7 @@ static bool DisassembleThumbFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   }
 
   // A6.3 32-bit Thumb instruction encoding
-  
+
   uint16_t op1 = slice(HalfWord, 12, 11);
   uint16_t op2 = slice(HalfWord, 10, 4);
   uint16_t op = slice(insn, 15, 15);
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 8026e77..1499da0 100644
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "asm-printer"
-#include "ARM.h" // FIXME: FACTOR ENUMS BETTER.
+#include "ARMBaseInfo.h"
 #include "ARMInstPrinter.h"
 #include "ARMAddressingModes.h"
 #include "llvm/MC/MCInst.h"
@@ -22,86 +22,20 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-// Include the auto-generated portion of the assembly writer.
-#define MachineInstr MCInst
-#define ARMAsmPrinter ARMInstPrinter  // FIXME: REMOVE.
+#define GET_INSTRUCTION_NAME
 #include "ARMGenAsmWriter.inc"
-#undef MachineInstr
-#undef ARMAsmPrinter
 
-static unsigned NextReg(unsigned Reg) {
-  switch (Reg) {
-  default:
-    assert(0 && "Unexpected register enum");
-
-  case ARM::D0:
-    return ARM::D1;
-  case ARM::D1:
-    return ARM::D2;
-  case ARM::D2:
-    return ARM::D3;
-  case ARM::D3:
-    return ARM::D4;
-  case ARM::D4:
-    return ARM::D5;
-  case ARM::D5:
-    return ARM::D6;
-  case ARM::D6:
-    return ARM::D7;
-  case ARM::D7:
-    return ARM::D8;
-  case ARM::D8:
-    return ARM::D9;
-  case ARM::D9:
-    return ARM::D10;
-  case ARM::D10:
-    return ARM::D11;
-  case ARM::D11:
-    return ARM::D12;
-  case ARM::D12:
-    return ARM::D13;
-  case ARM::D13:
-    return ARM::D14;
-  case ARM::D14:
-    return ARM::D15;
-  case ARM::D15:
-    return ARM::D16;
-  case ARM::D16:
-    return ARM::D17;
-  case ARM::D17:
-    return ARM::D18;
-  case ARM::D18:
-    return ARM::D19;
-  case ARM::D19:
-    return ARM::D20;
-  case ARM::D20:
-    return ARM::D21;
-  case ARM::D21:
-    return ARM::D22;
-  case ARM::D22:
-    return ARM::D23;
-  case ARM::D23:
-    return ARM::D24;
-  case ARM::D24:
-    return ARM::D25;
-  case ARM::D25:
-    return ARM::D26;
-  case ARM::D26:
-    return ARM::D27;
-  case ARM::D27:
-    return ARM::D28;
-  case ARM::D28:
-    return ARM::D29;
-  case ARM::D29:
-    return ARM::D30;
-  case ARM::D30:
-    return ARM::D31;
-  }
+StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return getInstructionName(Opcode);
 }
 
+
 void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+  unsigned Opcode = MI->getOpcode();
+
   // Check for MOVs and print canonical forms, instead.
-  if (MI->getOpcode() == ARM::MOVs) {
+  if (Opcode == ARM::MOVs) {
+    // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
     const MCOperand &MO2 = MI->getOperand(2);
@@ -129,118 +63,82 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
   }
 
   // A8.6.123 PUSH
-  if ((MI->getOpcode() == ARM::STM_UPD || MI->getOpcode() == ARM::t2STM_UPD) &&
+  if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
       MI->getOperand(0).getReg() == ARM::SP) {
-    const MCOperand &MO1 = MI->getOperand(2);
-    if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) {
-      O << '\t' << "push";
-      printPredicateOperand(MI, 3, O);
-      O << '\t';
-      printRegisterList(MI, 5, O);
-      return;
-    }
+    O << '\t' << "push";
+    printPredicateOperand(MI, 2, O);
+    if (Opcode == ARM::t2STMDB_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    return;
   }
 
   // A8.6.122 POP
-  if ((MI->getOpcode() == ARM::LDM_UPD || MI->getOpcode() == ARM::t2LDM_UPD) &&
+  if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) &&
       MI->getOperand(0).getReg() == ARM::SP) {
-    const MCOperand &MO1 = MI->getOperand(2);
-    if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) {
-      O << '\t' << "pop";
-      printPredicateOperand(MI, 3, O);
-      O << '\t';
-      printRegisterList(MI, 5, O);
-      return;
-    }
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 2, O);
+    if (Opcode == ARM::t2LDMIA_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    return;
   }
 
   // A8.6.355 VPUSH
-  if ((MI->getOpcode() == ARM::VSTMS_UPD || MI->getOpcode() ==ARM::VSTMD_UPD) &&
+  if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
       MI->getOperand(0).getReg() == ARM::SP) {
-    const MCOperand &MO1 = MI->getOperand(2);
-    if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) {
-      O << '\t' << "vpush";
-      printPredicateOperand(MI, 3, O);
-      O << '\t';
-      printRegisterList(MI, 5, O);
-      return;
-    }
+    O << '\t' << "vpush";
+    printPredicateOperand(MI, 2, O);
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    return;
   }
 
   // A8.6.354 VPOP
-  if ((MI->getOpcode() == ARM::VLDMS_UPD || MI->getOpcode() ==ARM::VLDMD_UPD) &&
+  if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) &&
       MI->getOperand(0).getReg() == ARM::SP) {
-    const MCOperand &MO1 = MI->getOperand(2);
-    if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) {
-      O << '\t' << "vpop";
-      printPredicateOperand(MI, 3, O);
-      O << '\t';
-      printRegisterList(MI, 5, O);
-      return;
-    }
+    O << '\t' << "vpop";
+    printPredicateOperand(MI, 2, O);
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    return;
   }
 
   printInstruction(MI, O);
- }
+}
 
 void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                  raw_ostream &O, const char *Modifier) {
+                                  raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
     unsigned Reg = Op.getReg();
-    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
-      O << '{' << getRegisterName(Reg) << ", "
-               << getRegisterName(NextReg(Reg)) << '}';
-#if 0
-      // FIXME: Breaks e.g. ARM/vmul.ll.
-      assert(0);
-      /*
-      unsigned DRegLo = TRI->getSubReg(Reg, ARM::dsub_0);
-      unsigned DRegHi = TRI->getSubReg(Reg, ARM::dsub_1);
-      O << '{'
-      << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
-      << '}';*/
-#endif
-    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
-      assert(0);
-      /*
-      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
-      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
-                                               &ARM::DPR_VFP2RegClass);
-      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
-       */
-    } else {
-      O << getRegisterName(Reg);
-    }
+    O << getRegisterName(Reg);
   } else if (Op.isImm()) {
-    assert((Modifier && !strcmp(Modifier, "call")) ||
-           ((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"));
     O << '#' << Op.getImm();
   } else {
-    if (Modifier && Modifier[0] != 0 && strcmp(Modifier, "call") != 0)
-      llvm_unreachable("Unsupported modifier");
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << *Op.getExpr();
   }
 }
 
-static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
+static void printSOImm(raw_ostream &O, int64_t V, raw_ostream *CommentStream,
                        const MCAsmInfo *MAI) {
   // Break it up into two parts that make up a shifter immediate.
   V = ARM_AM::getSOImmVal(V);
   assert(V != -1 && "Not a valid so_imm value!");
-  
+
   unsigned Imm = ARM_AM::getSOImmValImm(V);
   unsigned Rot = ARM_AM::getSOImmValRot(V);
-  
+
   // Print low-level immediate formation info, per
   // A5.1.3: "Data-processing operands - Immediate".
   if (Rot) {
     O << "#" << Imm << ", " << Rot;
     // Pretty printed version.
-    if (VerboseAsm)
-      O << ' ' << MAI->getCommentString()
-      << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+    if (CommentStream)
+      *CommentStream << (int)ARM_AM::rotr32(Imm, Rot) << "\n";
   } else {
     O << "#" << Imm;
   }
@@ -253,15 +151,7 @@ void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum,
                                        raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
   assert(MO.isImm() && "Not a valid so_imm value!");
-  printSOImm(O, MO.getImm(), VerboseAsm, &MAI);
-}
-
-/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
-/// followed by an 'orr' to materialize.
-void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O) {
-  // FIXME: REMOVE this method.
-  abort();
+  printSOImm(O, MO.getImm(), CommentStream, &MAI);
 }
 
 // so_reg is a 4-operand unit corresponding to register forms of the A5.1
@@ -274,9 +164,9 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum,
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
   const MCOperand &MO3 = MI->getOperand(OpNum+2);
-  
+
   O << getRegisterName(MO1.getReg());
-  
+
   // Print the shift opc.
   ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
   O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
@@ -294,14 +184,14 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
   const MCOperand &MO1 = MI->getOperand(Op);
   const MCOperand &MO2 = MI->getOperand(Op+1);
   const MCOperand &MO3 = MI->getOperand(Op+2);
-  
+
   if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
     printOperand(MI, Op, O);
     return;
   }
-  
+
   O << "[" << getRegisterName(MO1.getReg());
-  
+
   if (!MO2.getReg()) {
     if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
       O << ", #"
@@ -310,24 +200,24 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
     O << "]";
     return;
   }
-  
+
   O << ", "
     << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
     << getRegisterName(MO2.getReg());
-  
+
   if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
     O << ", "
     << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
     << " #" << ShImm;
   O << "]";
-}  
+}
 
 void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
                                                  unsigned OpNum,
                                                  raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
-  
+
   if (!MO1.getReg()) {
     unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
     O << '#'
@@ -335,10 +225,10 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
       << ImmOffs;
     return;
   }
-  
+
   O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
     << getRegisterName(MO1.getReg());
-  
+
   if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
     O << ", "
     << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
@@ -350,15 +240,15 @@ void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum,
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
   const MCOperand &MO3 = MI->getOperand(OpNum+2);
-  
+
   O << '[' << getRegisterName(MO1.getReg());
-  
+
   if (MO2.getReg()) {
     O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
       << getRegisterName(MO2.getReg()) << ']';
     return;
   }
-  
+
   if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
     O << ", #"
       << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
@@ -371,53 +261,42 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
                                                  raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
-  
+
   if (MO1.getReg()) {
     O << (char)ARM_AM::getAM3Op(MO2.getImm())
     << getRegisterName(MO1.getReg());
     return;
   }
-  
+
   unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
   O << '#'
     << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
     << ImmOffs;
 }
 
-
-void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &O,
-                                           const char *Modifier) {
-  const MCOperand &MO2 = MI->getOperand(OpNum+1);
-  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
-  if (Modifier && strcmp(Modifier, "submode") == 0) {
-    O << ARM_AM::getAMSubModeStr(Mode);
-  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
-    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
-    if (Mode == ARM_AM::ia)
-      O << ".w";
-  } else {
-    printOperand(MI, OpNum, O);
-  }
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum)
+                                                 .getImm());
+  O << ARM_AM::getAMSubModeStr(Mode);
 }
 
 void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &O,
-                                           const char *Modifier) {
+                                           raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
-  
+
   if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
     printOperand(MI, OpNum, O);
     return;
   }
-  
+
   O << "[" << getRegisterName(MO1.getReg());
-  
+
   if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
     O << ", #"
       << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
-      << ImmOffs*4;
+      << ImmOffs * 4;
   }
   O << "]";
 }
@@ -426,7 +305,7 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
-  
+
   O << "[" << getRegisterName(MO1.getReg());
   if (MO2.getImm()) {
     // FIXME: Both darwin as and GNU as violate ARM docs here.
@@ -445,12 +324,6 @@ void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
     O << ", " << getRegisterName(MO.getReg());
 }
 
-void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O,
-                                            const char *Modifier) {
-  assert(0 && "FIXME: Implement printAddrModePCOperand");
-}
-
 void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
                                                     unsigned OpNum,
                                                     raw_ostream &O) {
@@ -497,33 +370,41 @@ void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
   O << "}";
 }
 
-void ARMInstPrinter::printCPSOptionOperand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &O) {
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNum);
-  unsigned option = Op.getImm();
-  unsigned mode = option & 31;
-  bool changemode = option >> 5 & 1;
-  unsigned AIF = option >> 6 & 7;
-  unsigned imod = option >> 9 & 3;
-  if (imod == 2)
-    O << "ie";
-  else if (imod == 3)
-    O << "id";
-  O << '\t';
-  if (imod > 1) {
-    if (AIF & 4) O << 'a';
-    if (AIF & 2) O << 'i';
-    if (AIF & 1) O << 'f';
-    if (AIF > 0 && changemode) O << ", ";
-  }
-  if (changemode)
-    O << '#' << mode;
+  if (Op.getImm())
+    O << "be";
+  else
+    O << "le";
+}
+
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  O << ARM_PROC::IModToString(Op.getImm());
+}
+
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned IFlags = Op.getImm();
+  for (int i=2; i >= 0; --i)
+    if (IFlags & (1 << i))
+      O << ARM_PROC::IFlagsToString(1 << i);
 }
 
 void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
                                          raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNum);
-  unsigned Mask = Op.getImm();
+  unsigned SpecRegRBit = Op.getImm() >> 4;
+  unsigned Mask = Op.getImm() & 0xf;
+
+  if (SpecRegRBit)
+    O << "spsr";
+  else
+    O << "cpsr";
+
   if (Mask) {
     O << '_';
     if (Mask & 8) O << 'f';
@@ -550,7 +431,7 @@ void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
     O << ARMCondCodeToString(CC);
 }
 
-void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, 
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
                                                     unsigned OpNum,
                                                     raw_ostream &O) {
   ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
@@ -566,25 +447,24 @@ void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
   }
 }
 
-
-
-void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum,
-                                        raw_ostream &O,
-                                        const char *Modifier) {
-  // FIXME: remove this.
-  abort();
-}
-
 void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
                                           raw_ostream &O) {
   O << MI->getOperand(OpNum).getImm();
 }
 
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "p" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "c" << MI->getOperand(OpNum).getImm();
+}
 
 void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O) {
-  // FIXME: remove this.
-  abort();
+  llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
 }
 
 void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
@@ -611,17 +491,25 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
 void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
                                                  raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(Op);
-  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
   O << "[" << getRegisterName(MO1.getReg());
-  O << ", " << getRegisterName(MO2.getReg()) << "]";
+  if (unsigned RegNum = MO2.getReg())
+    O << ", " << getRegisterName(RegNum);
+  O << "]";
 }
 
-void ARMInstPrinter::printThumbAddrModeRI5Operand(const MCInst *MI, unsigned Op,
-                                                  raw_ostream &O,
-                                                  unsigned Scale) {
+void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
+                                                    unsigned Op,
+                                                    raw_ostream &O,
+                                                    unsigned Scale) {
   const MCOperand &MO1 = MI->getOperand(Op);
-  const MCOperand &MO2 = MI->getOperand(Op+1);
-  const MCOperand &MO3 = MI->getOperand(Op+2);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
 
   if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
     printOperand(MI, Op, O);
@@ -629,44 +517,32 @@ void ARMInstPrinter::printThumbAddrModeRI5Operand(const MCInst *MI, unsigned Op,
   }
 
   O << "[" << getRegisterName(MO1.getReg());
-  if (MO3.getReg())
-    O << ", " << getRegisterName(MO3.getReg());
-  else if (unsigned ImmOffs = MO2.getImm())
+  if (unsigned ImmOffs = MO2.getImm())
     O << ", #" << ImmOffs * Scale;
   O << "]";
 }
 
-void ARMInstPrinter::printThumbAddrModeS1Operand(const MCInst *MI, unsigned Op,
-                                                 raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 1);
+void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 1);
 }
 
-void ARMInstPrinter::printThumbAddrModeS2Operand(const MCInst *MI, unsigned Op,
-                                                 raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 2);
+void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 2);
 }
 
-void ARMInstPrinter::printThumbAddrModeS4Operand(const MCInst *MI, unsigned Op,
-                                                 raw_ostream &O) {
-  printThumbAddrModeRI5Operand(MI, Op, O, 4);
+void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
 }
 
 void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op,
                                                  raw_ostream &O) {
-  const MCOperand &MO1 = MI->getOperand(Op);
-  const MCOperand &MO2 = MI->getOperand(Op+1);
-  O << "[" << getRegisterName(MO1.getReg());
-  if (unsigned ImmOffs = MO2.getImm())
-    O << ", #" << ImmOffs*4;
-  O << "]";
-}
-
-void ARMInstPrinter::printTBAddrMode(const MCInst *MI, unsigned OpNum,
-                                     raw_ostream &O) {
-  O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg());
-  if (MI->getOpcode() == ARM::t2TBH)
-    O << ", lsl #1";
-  O << ']';
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
 }
 
 // Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
@@ -689,16 +565,26 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
     O << " #" << ARM_AM::getSORegOffset(MO2.getImm());
 }
 
-void ARMInstPrinter::printT2AddrModeImm12Operand(const MCInst *MI,
-                                                 unsigned OpNum,
-                                                 raw_ostream &O) {
+void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   const MCOperand &MO2 = MI->getOperand(OpNum+1);
 
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
   O << "[" << getRegisterName(MO1.getReg());
 
-  unsigned OffImm = MO2.getImm();
-  if (OffImm)  // Don't print +0.
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub)
+    O << ", #-" << -OffImm;
+  else if (OffImm > 0)
     O << ", #" << OffImm;
   O << "]";
 }
@@ -783,12 +669,37 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
 
 void ARMInstPrinter::printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
-  O << '#' << MI->getOperand(OpNum).getImm();
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << '#';
+  if (MO.isFPImm()) {
+    O << (float)MO.getFPImm();
+  } else {
+    union {
+      uint32_t I;
+      float F;
+    } FPUnion;
+
+    FPUnion.I = MO.getImm();
+    O << FPUnion.F;
+  }
 }
 
 void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
-  O << '#' << MI->getOperand(OpNum).getImm();
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << '#';
+  if (MO.isFPImm()) {
+    O << MO.getFPImm();
+  } else {
+    // We expect the binary encoding of a floating point number here.
+    union {
+      uint64_t I;
+      double D;
+    } FPUnion;
+
+    FPUnion.I = MO.getImm();
+    O << FPUnion.D;
+  }
 }
 
 void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index e5ad0d0..679d313 100644
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -18,26 +18,25 @@
 
 namespace llvm {
   class MCOperand;
-  
+
 class ARMInstPrinter : public MCInstPrinter {
-  bool VerboseAsm;
 public:
-  ARMInstPrinter(const MCAsmInfo &MAI, bool verboseAsm)
-    : MCInstPrinter(MAI), VerboseAsm(verboseAsm) {}
+  ARMInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {}
 
   virtual void printInst(const MCInst *MI, raw_ostream &O);
-  
+  virtual StringRef getOpcodeName(unsigned Opcode) const;
+
+  static const char *getInstructionName(unsigned Opcode);
+
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &O);
   static const char *getRegisterName(unsigned RegNo);
 
 
-  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                    const char *Modifier = 0);
-    
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
   void printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  
+
   void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum,
@@ -45,15 +44,11 @@ public:
   void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  void printAddrMode4Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                             const char *Modifier = 0);
-  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                             const char *Modifier = 0);
+  void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  void printAddrModePCOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                              const char *Modifier = 0);
 
   void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
                                       raw_ostream &O);
@@ -64,20 +59,20 @@ public:
   void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum,
-                                    raw_ostream &O, unsigned Scale);
-  void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O);
-  void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O);
-  void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O);
+  void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O, unsigned Scale);
+  void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
   void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  
+
   void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O);
+  void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O);
   void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O);
   void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
@@ -88,7 +83,10 @@ public:
                                           raw_ostream &O);
   void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  
+
+  void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printCPSOptionOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printNegZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -98,21 +96,16 @@ public:
   void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
                                 raw_ostream &O);
   void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printCPInstOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                          const char *Modifier);
-  void printJTBlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {}
-  void printJT2BlockOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {}
-  void printTBAddrMode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
-  void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);  
-  // FIXME: Implement.
-  void PrintSpecial(const MCInst *MI, raw_ostream &O, const char *Kind) {}
+  void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 };
-  
-}
+
+} // end namespace llvm
 
 #endif
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..18645c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMAsmPrinter
+  ARMInstPrinter.cpp
+  )
+add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile b/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile
new file mode 100644
index 0000000..65d372e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmPrinter
+
+# Hack: we need to include 'main' arm target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 0000000..f9e86eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,321 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiple and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mlx-expansion"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
+
+namespace {
+  struct MLxExpansion : public MachineFunctionPass {
+    static char ID;
+    MLxExpansion() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM MLA / MLS expansion pass";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    unsigned MIIdx;
+    MachineInstr* LastMIs[4];
+
+    void clearStack();
+    void pushStack(MachineInstr *MI);
+    MachineInstr *getAccDefMI(MachineInstr *MI) const;
+    unsigned getDefReg(MachineInstr *MI) const;
+    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI) const;
+    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                unsigned MulOpc, unsigned AddSubOpc,
+                                bool NegAcc, bool HasLane);
+    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+  };
+  char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() {
+  std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
+  MIIdx = 0;
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) {
+  LastMIs[MIIdx] = MI;
+  if (++MIIdx == 4)
+    MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
+  // Look past COPY and INSERT_SUBREG instructions to find the
+  // real definition MI. This is important for _sfp instructions.
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+    if (DefMI->getParent() != MBB)
+      break;
+    if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+    break;
+  }
+  return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return Reg;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
+  if (UseMI->getParent() != MBB)
+    return Reg;
+
+  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI->hasOneNonDBGUse(Reg))
+      return Reg;
+    UseMI = &*MRI->use_nodbg_begin(Reg);
+    if (UseMI->getParent() != MBB)
+      return Reg;
+  }
+
+  return Reg;
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  // FIXME: Detect integer instructions properly.
+  unsigned Domain = TID.TSFlags & ARMII::DomainMask;
+  if (Domain == ARMII::DomainVFP) {
+    unsigned Opcode = TID.getOpcode();
+    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
+        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return false;
+  } else if (Domain == ARMII::DomainNEON) {
+    if (TID.mayStore() || TID.mayLoad())
+      return false;
+  } else {
+    return false;
+  }
+
+  return MI->readsRegister(Reg, TRI);
+  return false;
+}
+
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExapnd)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode()))
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    return true;
+
+  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions, if *most* of them can cause hazards,
+  // then the scheduler can't *fix* this, we'd better break up the VMLA.
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
+      return true;
+
+    // Look for VMLx RAW hazard.
+    if (hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                     unsigned MulOpc, unsigned AddSubOpc,
+                                     bool NegAcc, bool HasLane) {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  bool DstDead = MI->getOperand(0).isDead();
+  unsigned AccReg = MI->getOperand(1).getReg();
+  unsigned Src1Reg = MI->getOperand(2).getReg();
+  unsigned Src2Reg = MI->getOperand(3).getReg();
+  bool Src1Kill = MI->getOperand(2).isKill();
+  bool Src2Kill = MI->getOperand(3).isKill();
+  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
+  unsigned NextOp = HasLane ? 5 : 4;
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
+  unsigned PredReg = MI->getOperand(++NextOp).getReg();
+
+  const TargetInstrDesc &TID1 = TII->get(MulOpc);
+  const TargetInstrDesc &TID2 = TII->get(AddSubOpc);
+  unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI));
+
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg)
+    .addReg(Src1Reg, getKillRegState(Src1Kill))
+    .addReg(Src2Reg, getKillRegState(Src2Kill));
+  if (HasLane)
+    MIB.addImm(LaneImm);
+  MIB.addImm(Pred).addReg(PredReg);
+
+  MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2)
+    .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
+
+  if (NegAcc) {
+    bool AccKill = MRI->hasOneNonDBGUse(AccReg);
+    MIB.addReg(TmpReg, getKillRegState(true))
+       .addReg(AccReg, getKillRegState(AccKill));
+  } else {
+    MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
+  }
+  MIB.addImm(Pred).addReg(PredReg);
+
+  DEBUG({
+      dbgs() << "Expanding: " << *MI;
+      dbgs() << "  to:\n";
+      MachineBasicBlock::iterator MII = MI;
+      MII = llvm::prior(MII);
+      MachineInstr &MI2 = *MII;
+      MII = llvm::prior(MII);
+      MachineInstr &MI1 = *MII;
+      dbgs() << "    " << MI1;
+      dbgs() << "    " << MI2;
+   });
+
+  MI->eraseFromParent();
+  ++NumExpand;
+}
+
+bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  clearStack();
+
+  unsigned Skip = 0;
+  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
+  while (MII != E) {
+    MachineInstr *MI = &*MII;
+
+    if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) {
+      ++MII;
+      continue;
+    }
+
+    const TargetInstrDesc &TID = MI->getDesc();
+    if (TID.isBarrier()) {
+      clearStack();
+      Skip = 0;
+      ++MII;
+      continue;
+    }
+
+    unsigned Domain = TID.TSFlags & ARMII::DomainMask;
+    if (Domain == ARMII::DomainGeneral) {
+      if (++Skip == 2)
+        // Assume dual issues of non-VFP / NEON instructions.
+        pushStack(0);
+    } else {
+      Skip = 0;
+
+      unsigned MulOpc, AddSubOpc;
+      bool NegAcc, HasLane;
+      if (!TII->isFpMLxInstruction(TID.getOpcode(),
+                                   MulOpc, AddSubOpc, NegAcc, HasLane) ||
+          !FindMLxHazard(MI))
+        pushStack(MI);
+      else {
+        ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
+        E = MBB.rend(); // May have changed if MI was the 1st instruction.
+        Changed = true;
+        continue;
+      }
+    }
+
+    ++MII;
+  }
+
+  return Changed;
+}
+
+bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= ExpandFPMLxInstructions(MBB);
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createMLxExpansionPass() {
+  return new MLxExpansion();
+}
diff --git a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp
deleted file mode 100644
index 3407ac6..0000000
--- a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp
+++ /dev/null
@@ -1,406 +0,0 @@
-//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "neon-prealloc"
-#include "ARM.h"
-#include "ARMInstrInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-using namespace llvm;
-
-namespace {
-  class NEONPreAllocPass : public MachineFunctionPass {
-    const TargetInstrInfo *TII;
-    MachineRegisterInfo *MRI;
-
-  public:
-    static char ID;
-    NEONPreAllocPass() : MachineFunctionPass(ID) {}
-
-    virtual bool runOnMachineFunction(MachineFunction &MF);
-
-    virtual const char *getPassName() const {
-      return "NEON register pre-allocation pass";
-    }
-
-  private:
-    bool FormsRegSequence(MachineInstr *MI,
-                          unsigned FirstOpnd, unsigned NumRegs,
-                          unsigned Offset, unsigned Stride) const;
-    bool PreAllocNEONRegisters(MachineBasicBlock &MBB);
-  };
-
-  char NEONPreAllocPass::ID = 0;
-}
-
-static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
-                             unsigned &Offset, unsigned &Stride) {
-  // Default to unit stride with no offset.
-  Stride = 1;
-  Offset = 0;
-
-  switch (Opcode) {
-  default:
-    break;
-
-  case ARM::VLD2LNd8:
-  case ARM::VLD2LNd16:
-  case ARM::VLD2LNd32:
-    FirstOpnd = 0;
-    NumRegs = 2;
-    return true;
-
-  case ARM::VLD2LNq16:
-  case ARM::VLD2LNq32:
-    FirstOpnd = 0;
-    NumRegs = 2;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VLD2LNq16odd:
-  case ARM::VLD2LNq32odd:
-    FirstOpnd = 0;
-    NumRegs = 2;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VLD3LNd8:
-  case ARM::VLD3LNd16:
-  case ARM::VLD3LNd32:
-    FirstOpnd = 0;
-    NumRegs = 3;
-    return true;
-
-  case ARM::VLD3LNq16:
-  case ARM::VLD3LNq32:
-    FirstOpnd = 0;
-    NumRegs = 3;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VLD3LNq16odd:
-  case ARM::VLD3LNq32odd:
-    FirstOpnd = 0;
-    NumRegs = 3;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VLD4LNd8:
-  case ARM::VLD4LNd16:
-  case ARM::VLD4LNd32:
-    FirstOpnd = 0;
-    NumRegs = 4;
-    return true;
-
-  case ARM::VLD4LNq16:
-  case ARM::VLD4LNq32:
-    FirstOpnd = 0;
-    NumRegs = 4;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VLD4LNq16odd:
-  case ARM::VLD4LNq32odd:
-    FirstOpnd = 0;
-    NumRegs = 4;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VST2LNd8:
-  case ARM::VST2LNd16:
-  case ARM::VST2LNd32:
-    FirstOpnd = 2;
-    NumRegs = 2;
-    return true;
-
-  case ARM::VST2LNq16:
-  case ARM::VST2LNq32:
-    FirstOpnd = 2;
-    NumRegs = 2;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VST2LNq16odd:
-  case ARM::VST2LNq32odd:
-    FirstOpnd = 2;
-    NumRegs = 2;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VST3LNd8:
-  case ARM::VST3LNd16:
-  case ARM::VST3LNd32:
-    FirstOpnd = 2;
-    NumRegs = 3;
-    return true;
-
-  case ARM::VST3LNq16:
-  case ARM::VST3LNq32:
-    FirstOpnd = 2;
-    NumRegs = 3;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VST3LNq16odd:
-  case ARM::VST3LNq32odd:
-    FirstOpnd = 2;
-    NumRegs = 3;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VST4LNd8:
-  case ARM::VST4LNd16:
-  case ARM::VST4LNd32:
-    FirstOpnd = 2;
-    NumRegs = 4;
-    return true;
-
-  case ARM::VST4LNq16:
-  case ARM::VST4LNq32:
-    FirstOpnd = 2;
-    NumRegs = 4;
-    Offset = 0;
-    Stride = 2;
-    return true;
-
-  case ARM::VST4LNq16odd:
-  case ARM::VST4LNq32odd:
-    FirstOpnd = 2;
-    NumRegs = 4;
-    Offset = 1;
-    Stride = 2;
-    return true;
-
-  case ARM::VTBL2:
-    FirstOpnd = 1;
-    NumRegs = 2;
-    return true;
-
-  case ARM::VTBL3:
-    FirstOpnd = 1;
-    NumRegs = 3;
-    return true;
-
-  case ARM::VTBL4:
-    FirstOpnd = 1;
-    NumRegs = 4;
-    return true;
-
-  case ARM::VTBX2:
-    FirstOpnd = 2;
-    NumRegs = 2;
-    return true;
-
-  case ARM::VTBX3:
-    FirstOpnd = 2;
-    NumRegs = 3;
-    return true;
-
-  case ARM::VTBX4:
-    FirstOpnd = 2;
-    NumRegs = 4;
-    return true;
-  }
-
-  return false;
-}
-
-bool
-NEONPreAllocPass::FormsRegSequence(MachineInstr *MI,
-                                   unsigned FirstOpnd, unsigned NumRegs,
-                                   unsigned Offset, unsigned Stride) const {
-  MachineOperand &FMO = MI->getOperand(FirstOpnd);
-  assert(FMO.isReg() && FMO.getSubReg() == 0 && "unexpected operand");
-  unsigned VirtReg = FMO.getReg();
-  (void)VirtReg;
-  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
-         "expected a virtual register");
-
-  unsigned LastSubIdx = 0;
-  if (FMO.isDef()) {
-    MachineInstr *RegSeq = 0;
-    for (unsigned R = 0; R < NumRegs; ++R) {
-      const MachineOperand &MO = MI->getOperand(FirstOpnd + R);
-      assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
-      unsigned VirtReg = MO.getReg();
-      assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
-             "expected a virtual register");
-      // Feeding into a REG_SEQUENCE.
-      if (!MRI->hasOneNonDBGUse(VirtReg))
-        return false;
-      MachineInstr *UseMI = &*MRI->use_nodbg_begin(VirtReg);
-      if (!UseMI->isRegSequence())
-        return false;
-      if (RegSeq && RegSeq != UseMI)
-        return false;
-      unsigned OpIdx = 1 + (Offset + R * Stride) * 2;
-      if (UseMI->getOperand(OpIdx).getReg() != VirtReg)
-        llvm_unreachable("Malformed REG_SEQUENCE instruction!");
-      unsigned SubIdx = UseMI->getOperand(OpIdx + 1).getImm();
-      if (LastSubIdx) {
-        if (LastSubIdx != SubIdx-Stride)
-          return false;
-      } else {
-        // Must start from dsub_0 or qsub_0.
-        if (SubIdx != (ARM::dsub_0+Offset) &&
-            SubIdx != (ARM::qsub_0+Offset))
-          return false;
-      }
-      RegSeq = UseMI;
-      LastSubIdx = SubIdx;
-    }
-
-    // In the case of vld3, etc., make sure the trailing operand of
-    // REG_SEQUENCE is an undef.
-    if (NumRegs == 3) {
-      unsigned OpIdx = 1 + (Offset + 3 * Stride) * 2;
-      const MachineOperand &MO = RegSeq->getOperand(OpIdx);
-      unsigned VirtReg = MO.getReg();
-      MachineInstr *DefMI = MRI->getVRegDef(VirtReg);
-      if (!DefMI || !DefMI->isImplicitDef())
-        return false;
-    }
-    return true;
-  }
-
-  unsigned LastSrcReg = 0;
-  SmallVector<unsigned, 4> SubIds;
-  for (unsigned R = 0; R < NumRegs; ++R) {
-    const MachineOperand &MO = MI->getOperand(FirstOpnd + R);
-    assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
-    unsigned VirtReg = MO.getReg();
-    assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
-           "expected a virtual register");
-    // Extracting from a Q or QQ register.
-    MachineInstr *DefMI = MRI->getVRegDef(VirtReg);
-    if (!DefMI || !DefMI->isCopy() || !DefMI->getOperand(1).getSubReg())
-      return false;
-    VirtReg = DefMI->getOperand(1).getReg();
-    if (LastSrcReg && LastSrcReg != VirtReg)
-      return false;
-    LastSrcReg = VirtReg;
-    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
-    if (RC != ARM::QPRRegisterClass &&
-        RC != ARM::QQPRRegisterClass &&
-        RC != ARM::QQQQPRRegisterClass)
-      return false;
-    unsigned SubIdx = DefMI->getOperand(1).getSubReg();
-    if (LastSubIdx) {
-      if (LastSubIdx != SubIdx-Stride)
-        return false;
-    } else {
-      // Must start from dsub_0 or qsub_0.
-      if (SubIdx != (ARM::dsub_0+Offset) &&
-          SubIdx != (ARM::qsub_0+Offset))
-        return false;
-    }
-    SubIds.push_back(SubIdx);
-    LastSubIdx = SubIdx;
-  }
-
-  // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is
-  // currently required for correctness. e.g.
-  //  %reg1041<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6
-  //  %reg1042<def> = EXTRACT_SUBREG %reg1041, 6
-  //  %reg1043<def> = EXTRACT_SUBREG %reg1041, 5
-  //  VST1q16 %reg1025<kill>, 0, %reg1043<kill>, %reg1042<kill>,
-  // reg1042 and reg1043 should be replaced with reg1041:6 and reg1041:5
-  // respectively.
-  // We need to change how we model uses of REG_SEQUENCE.
-  for (unsigned R = 0; R < NumRegs; ++R) {
-    MachineOperand &MO = MI->getOperand(FirstOpnd + R);
-    unsigned OldReg = MO.getReg();
-    MachineInstr *DefMI = MRI->getVRegDef(OldReg);
-    assert(DefMI->isCopy());
-    MO.setReg(LastSrcReg);
-    MO.setSubReg(SubIds[R]);
-    MO.setIsKill(false);
-    // Delete the EXTRACT_SUBREG if its result is now dead.
-    if (MRI->use_empty(OldReg))
-      DefMI->eraseFromParent();
-  }
-
-  return true;
-}
-
-bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
-  bool Modified = false;
-
-  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-  for (; MBBI != E; ++MBBI) {
-    MachineInstr *MI = &*MBBI;
-    unsigned FirstOpnd, NumRegs, Offset, Stride;
-    if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
-      continue;
-    if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride))
-      continue;
-
-    MachineBasicBlock::iterator NextI = llvm::next(MBBI);
-    for (unsigned R = 0; R < NumRegs; ++R) {
-      MachineOperand &MO = MI->getOperand(FirstOpnd + R);
-      assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
-      unsigned VirtReg = MO.getReg();
-      assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
-             "expected a virtual register");
-
-      // For now, just assign a fixed set of adjacent registers.
-      // This leaves plenty of room for future improvements.
-      static const unsigned NEONDRegs[] = {
-        ARM::D0, ARM::D1, ARM::D2, ARM::D3,
-        ARM::D4, ARM::D5, ARM::D6, ARM::D7
-      };
-      MO.setReg(NEONDRegs[Offset + R * Stride]);
-
-      if (MO.isUse()) {
-        // Insert a copy from VirtReg.
-        BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),MO.getReg())
-          .addReg(VirtReg, getKillRegState(MO.isKill()));
-        MO.setIsKill();
-      } else if (MO.isDef() && !MO.isDead()) {
-        // Add a copy to VirtReg.
-        BuildMI(MBB, NextI, DebugLoc(), TII->get(TargetOpcode::COPY), VirtReg)
-          .addReg(MO.getReg());
-      }
-    }
-  }
-
-  return Modified;
-}
-
-bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) {
-  TII = MF.getTarget().getInstrInfo();
-  MRI = &MF.getRegInfo();
-
-  bool Modified = false;
-  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
-       ++MFI) {
-    MachineBasicBlock &MBB = *MFI;
-    Modified |= PreAllocNEONRegisters(MBB);
-  }
-
-  return Modified;
-}
-
-/// createNEONPreAllocPass - returns an instance of the NEON register
-/// pre-allocation pass.
-FunctionPass *llvm::createNEONPreAllocPass() {
-  return new NEONPreAllocPass();
-}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 0000000..233e165
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,352 @@
+//======- Thumb1FrameLowering.cpp - Thumb1 Frame Information ---*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static void emitSPUpdate(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         const TargetInstrInfo &TII, DebugLoc dl,
+                         const Thumb1RegisterInfo &MRI,
+                         int NumBytes) {
+  emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, dl);
+}
+
+void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned BasePtr = RegInfo->getBaseRegister();
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI->setStackSize(NumBytes);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (VARegSaveSize)
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+    if (MBBI != MBB.end())
+      dl = MBBI->getDebugLoc();
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  NumBytes = DPRCSOffset;
+
+  // Adjust FP so it point to the stack slot that contains the previous FP.
+  if (hasFP(MF)) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+    if (NumBytes > 7)
+      // If offset is > 7 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (NumBytes)
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes);
+
+  if (STI.isTargetELF() && hasFP(MF))
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP);
+    
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
+  if (MI->getOpcode() == ARM::tRestore &&
+      MI->getOperand(1).isFI() &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+    return true;
+  else if (MI->getOpcode() == ARM::tPOP) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  return false;
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert((MBBI->getOpcode() == ARM::tBX_RET ||
+          MBBI->getOpcode() == ARM::tPOP_RET) &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+  const unsigned *CSRegs = RegInfo->getCalleeSavedRegs();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+      if (!isCSRestore(MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot, the target is ELF and the function has FP, or
+      // the target uses var sized objects.
+      if (NumBytes) {
+        assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+               "No scratch register to restore SP from FP!");
+        emitThumbRegPlusImmediate(MBB, MBBI, ARM::R4, FramePtr, -NumBytes,
+                                  TII, *RegInfo, dl);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+          .addReg(ARM::R4);
+      } else
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+          .addReg(FramePtr);
+    } else {
+      if (MBBI->getOpcode() == ARM::tBX_RET &&
+          &MBB.front() != MBBI &&
+          prior(MBBI)->getOpcode() == ARM::tPOP) {
+        MachineBasicBlock::iterator PMBBI = prior(MBBI);
+        emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else
+        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+    }
+  }
+
+  if (VARegSaveSize) {
+    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+    // to LR, and we can't pop the value directly to the PC since
+    // we need to update the SP after popping the value. Therefore, we
+    // pop the old LR into R3 as a temporary.
+
+    // Move back past the callee-saved register restoration
+    while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs))
+      ++MBBI;
+    // Epilogue for vararg functions: pop LR to R3 and branch off it.
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(ARM::R3, RegState::Define);
+
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, VARegSaveSize);
+
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+      .addReg(ARM::R3, RegState::Kill);
+    // erase the old tBX_RET instruction
+    MBB.erase(MBBI);
+  }
+}
+
+bool Thumb1FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
+  AddDefaultPred(MIB);
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    bool isKill = true;
+
+    // Add the callee-saved register as live-in unless it's LR and
+    // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress
+    // then it's already added to the function and entry block live-in sets.
+    if (Reg == ARM::LR) {
+      MachineFunction &MF = *MBB.getParent();
+      if (MF.getFrameInfo()->isReturnAddressTaken() &&
+          MF.getRegInfo().isLiveIn(Reg))
+        isKill = false;
+    }
+
+    if (isKill)
+      MBB.addLiveIn(Reg);
+
+    MIB.addReg(Reg, getKillRegState(isKill));
+  }
+  return true;
+}
+
+bool Thumb1FrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
+  AddDefaultPred(MIB);
+
+  bool NumRegs = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (Reg == ARM::LR) {
+      // Special epilogue for vararg functions. See emitEpilogue
+      if (isVarArg)
+        continue;
+      Reg = ARM::PC;
+      (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+      MI = MBB.erase(MI);
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NumRegs = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NumRegs)
+    MBB.insert(MI, &*MIB);
+  else
+    MF.DeleteMachineInstr(MIB);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
new file mode 100644
index 0000000..c592e12
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -0,0 +1,52 @@
+//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __THUMB_FRAMEINFO_H_
+#define __THUMM_FRAMEINFO_H_
+
+#include "ARM.h"
+#include "ARMFrameLowering.h"
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb1FrameLowering : public ARMFrameLowering {
+public:
+  explicit Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index af630ac..3fbb433 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -71,8 +71,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     MachineFunction &MF = *MBB.getParent();
     MachineFrameInfo &MFI = *MF.getFrameInfo();
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                              MachineMemOperand::MOStore, 0,
+      MF.getMachineMemOperand(
+                    MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                              MachineMemOperand::MOStore,
                               MFI.getObjectSize(FI),
                               MFI.getObjectAlignment(FI));
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill))
@@ -99,85 +100,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     MachineFunction &MF = *MBB.getParent();
     MachineFrameInfo &MFI = *MF.getFrameInfo();
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                              MachineMemOperand::MOLoad, 0,
+      MF.getMachineMemOperand(
+                    MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                              MachineMemOperand::MOLoad,
                               MFI.getObjectSize(FI),
                               MFI.getObjectAlignment(FI));
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
   }
 }
-
-bool Thumb1InstrInfo::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI,
-                          const std::vector<CalleeSavedInfo> &CSI,
-                          const TargetRegisterInfo *TRI) const {
-  if (CSI.empty())
-    return false;
-
-  DebugLoc DL;
-  if (MI != MBB.end()) DL = MI->getDebugLoc();
-
-  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
-  AddDefaultPred(MIB);
-  for (unsigned i = CSI.size(); i != 0; --i) {
-    unsigned Reg = CSI[i-1].getReg();
-    bool isKill = true;
-
-    // Add the callee-saved register as live-in unless it's LR and
-    // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress
-    // then it's already added to the function and entry block live-in sets.
-    if (Reg == ARM::LR) {
-      MachineFunction &MF = *MBB.getParent();
-      if (MF.getFrameInfo()->isReturnAddressTaken() &&
-          MF.getRegInfo().isLiveIn(Reg))
-        isKill = false;
-    }
-
-    if (isKill)
-      MBB.addLiveIn(Reg);
-
-    MIB.addReg(Reg, getKillRegState(isKill));
-  }
-  return true;
-}
-
-bool Thumb1InstrInfo::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            const std::vector<CalleeSavedInfo> &CSI,
-                            const TargetRegisterInfo *TRI) const {
-  MachineFunction &MF = *MBB.getParent();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  if (CSI.empty())
-    return false;
-
-  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
-  DebugLoc DL = MI->getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP));
-  AddDefaultPred(MIB);
-
-  bool NumRegs = false;
-  for (unsigned i = CSI.size(); i != 0; --i) {
-    unsigned Reg = CSI[i-1].getReg();
-    if (Reg == ARM::LR) {
-      // Special epilogue for vararg functions. See emitEpilogue
-      if (isVarArg)
-        continue;
-      Reg = ARM::PC;
-      (*MIB).setDesc(get(ARM::tPOP_RET));
-      MI = MBB.erase(MI);
-    }
-    MIB.addReg(Reg, getDefRegState(true));
-    NumRegs = true;
-  }
-
-  // It's illegal to emit pop instruction without operands.
-  if (NumRegs)
-    MBB.insert(MI, &*MIB);
-  else
-    MF.DeleteMachineInstr(MIB);
-
-  return true;
-}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 555135a..17ef2f7 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -37,28 +37,19 @@ public:
   ///
   const Thumb1RegisterInfo &getRegisterInfo() const { return RI; }
 
-  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MI,
-                                 const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
-  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
-
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const;
   void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const;
 
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const;
 
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
index a21a3da..f62a13e 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -29,7 +29,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
@@ -63,24 +63,11 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
   unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
 
-  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp))
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
           .addReg(DestReg, getDefRegState(true), SubIdx)
           .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg);
 }
 
-bool Thumb1RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const {
-  const MachineFrameInfo *FFI = MF.getFrameInfo();
-  unsigned CFSize = FFI->getMaxCallFrameSize();
-  // It's not always a good idea to include the call frame as part of the
-  // stack frame. ARM (especially Thumb) has small immediate offset to
-  // address the stack frame. So a large call frame can cause poor codegen
-  // and may even makes it impossible to scavenge a register.
-  if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
-    return false;
-
-  return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
 
 /// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
 /// a destreg = basereg + immediate in Thumb code. Materialize the immediate
@@ -92,7 +79,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
                               unsigned DestReg, unsigned BaseReg,
                               int NumBytes, bool CanChangeCC,
                               const TargetInstrInfo &TII,
-                              const Thumb1RegisterInfo& MRI,
+                              const ARMBaseRegisterInfo& MRI,
                               DebugLoc dl) {
     MachineFunction &MF = *MBB.getParent();
     bool isHigh = !isARMLowRegister(DestReg) ||
@@ -162,13 +149,12 @@ static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
 
 /// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
 /// a destreg = basereg + immediate in Thumb code.
-static
-void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator &MBBI,
-                               unsigned DestReg, unsigned BaseReg,
-                               int NumBytes, const TargetInstrInfo &TII,
-                               const Thumb1RegisterInfo& MRI,
-                               DebugLoc dl) {
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     unsigned DestReg, unsigned BaseReg,
+                                     int NumBytes, const TargetInstrInfo &TII,
+                                     const ARMBaseRegisterInfo& MRI,
+                                     DebugLoc dl) {
   bool isSub = NumBytes < 0;
   unsigned Bytes = (unsigned)NumBytes;
   if (isSub) Bytes = -NumBytes;
@@ -304,7 +290,9 @@ static void emitSPUpdate(MachineBasicBlock &MBB,
 void Thumb1RegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  if (!hasReservedCallFrame(MF)) {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (!TFI->hasReservedCallFrame(MF)) {
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
     // ADJCALLSTACKUP   -> add, sp, sp, amount
@@ -315,7 +303,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       // We need to keep the stack aligned properly.  To do this, we round the
       // amount of space needed for the outgoing arguments up to the next
       // alignment boundary.
-      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      unsigned Align = TFI->getStackAlignment();
       Amount = (Amount+Align-1)/Align*Align;
 
       // Replace the pseudo instruction with a new instruction...
@@ -363,6 +351,22 @@ static void removeOperands(MachineInstr &MI, unsigned i) {
     MI.RemoveOperand(Op);
 }
 
+/// convertToNonSPOpcode - Change the opcode to the non-SP version, because
+/// we're replacing the frame index with a non-SP register.
+static unsigned convertToNonSPOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::tLDRspi:
+  case ARM::tRestore:           // FIXME: Should this opcode be here?
+    return ARM::tLDRi;
+
+  case ARM::tSTRspi:
+  case ARM::tSpill:             // FIXME: Should this opcode be here?
+    return ARM::tSTRi;
+  }
+
+  return Opcode;
+}
+
 bool Thumb1RegisterInfo::
 rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
                   unsigned FrameReg, int &Offset,
@@ -464,55 +468,51 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
     }
     return true;
   } else {
-    unsigned ImmIdx = 0;
-    int InstrOffs = 0;
-    unsigned NumBits = 0;
-    unsigned Scale = 1;
-    switch (AddrMode) {
-    case ARMII::AddrModeT1_s: {
-      ImmIdx = FrameRegIdx+1;
-      InstrOffs = MI.getOperand(ImmIdx).getImm();
-      NumBits = (FrameReg == ARM::SP) ? 8 : 5;
-      Scale = 4;
-      break;
-    }
-    default:
+    if (AddrMode != ARMII::AddrModeT1_s)
       llvm_unreachable("Unsupported addressing mode!");
-      break;
-    }
+
+    unsigned ImmIdx = FrameRegIdx + 1;
+    int InstrOffs = MI.getOperand(ImmIdx).getImm();
+    unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+    unsigned Scale = 4;
 
     Offset += InstrOffs * Scale;
-    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!");
 
     // Common case: small offset, fits into instruction.
     MachineOperand &ImmOp = MI.getOperand(ImmIdx);
     int ImmedOffset = Offset / Scale;
     unsigned Mask = (1 << NumBits) - 1;
+
     if ((unsigned)Offset <= Mask * Scale) {
-      // Replace the FrameIndex with sp
+      // Replace the FrameIndex with the frame register (e.g., sp).
       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
       ImmOp.ChangeToImmediate(ImmedOffset);
+
+      // If we're using a register where sp was stored, convert the instruction
+      // to the non-SP version.
+      unsigned NewOpc = convertToNonSPOpcode(Opcode);
+      if (NewOpc != Opcode && FrameReg != ARM::SP)
+        MI.setDesc(TII.get(NewOpc));
+
       return true;
     }
 
-    bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
-    if (AddrMode == ARMII::AddrModeT1_s) {
-      // Thumb tLDRspi, tSTRspi. These will change to instructions that use
-      // a different base register.
-      NumBits = 5;
-      Mask = (1 << NumBits) - 1;
-    }
+    NumBits = 5;
+    Mask = (1 << NumBits) - 1;
+
     // If this is a thumb spill / restore, we will be using a constpool load to
     // materialize the offset.
-    if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore)
+    if (Opcode == ARM::tRestore || Opcode == ARM::tSpill) {
       ImmOp.ChangeToImmediate(0);
-    else {
+    } else {
       // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
       ImmedOffset = ImmedOffset & Mask;
       ImmOp.ChangeToImmediate(ImmedOffset);
-      Offset &= ~(Mask*Scale);
+      Offset &= ~(Mask * Scale);
     }
   }
+
   return Offset == 0;
 }
 
@@ -602,7 +602,8 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
     Offset -= AFI->getGPRCalleeSavedArea2Offset();
   else if (MF.getFrameInfo()->hasVarSizedObjects()) {
-    assert(SPAdj == 0 && hasFP(MF) && "Unexpected");
+    assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
+           "Unexpected");
     // There are alloca()'s in this function, must reference off the frame
     // pointer or base pointer instead.
     if (!hasBasePointer(MF)) {
@@ -655,13 +656,12 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                 *this, dl);
     }
 
-    MI.setDesc(TII.get(ARM::tLDR));
+    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
     MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
     if (UseRR)
-      // Use [reg, reg] addrmode.
-      MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
-    else  // tLDR has an extra register operand.
-      MI.addOperand(MachineOperand::CreateReg(0, false));
+      // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+      // register. The offset is already handled in the vreg value.
+      MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false);
   } else if (Desc.mayStore()) {
       VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
       bool UseRR = false;
@@ -677,14 +677,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
       } else
         emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
                                   *this, dl);
-      MI.setDesc(TII.get(ARM::tSTR));
+      MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
       MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
-      if (UseRR)  // Use [reg, reg] addrmode.
-        MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
-      else // tSTR has an extra register operand.
-        MI.addOperand(MachineOperand::CreateReg(0, false));
-  } else
+      if (UseRR)
+        // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+        // register. The offset is already handled in the vreg value.
+        MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false);
+  } else {
     assert(false && "Unexpected opcode!");
+  }
 
   // Add predicate back if it's needed.
   if (MI.getDesc().isPredicable()) {
@@ -692,206 +693,3 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     AddDefaultPred(MIB);
   }
 }
-
-void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
-  MachineBasicBlock &MBB = MF.front();
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  MachineFrameInfo  *MFI = MF.getFrameInfo();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
-  unsigned NumBytes = MFI->getStackSize();
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
-
-  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
-  NumBytes = (NumBytes + 3) & ~3;
-  MFI->setStackSize(NumBytes);
-
-  // Determine the sizes of each callee-save spill areas and record which frame
-  // belongs to which callee-save spill areas.
-  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
-  int FramePtrSpillFI = 0;
-
-  if (VARegSaveSize)
-    emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize);
-
-  if (!AFI->hasStackFrame()) {
-    if (NumBytes != 0)
-      emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
-    return;
-  }
-
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    int FI = CSI[i].getFrameIdx();
-    switch (Reg) {
-    case ARM::R4:
-    case ARM::R5:
-    case ARM::R6:
-    case ARM::R7:
-    case ARM::LR:
-      if (Reg == FramePtr)
-        FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
-      GPRCS1Size += 4;
-      break;
-    case ARM::R8:
-    case ARM::R9:
-    case ARM::R10:
-    case ARM::R11:
-      if (Reg == FramePtr)
-        FramePtrSpillFI = FI;
-      if (STI.isTargetDarwin()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
-        GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
-        GPRCS1Size += 4;
-      }
-      break;
-    default:
-      AFI->addDPRCalleeSavedAreaFrame(FI);
-      DPRCSSize += 8;
-    }
-  }
-
-  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
-    ++MBBI;
-    if (MBBI != MBB.end())
-      dl = MBBI->getDebugLoc();
-  }
-
-  // Adjust FP so it point to the stack slot that contains the previous FP.
-  if (hasFP(MF)) {
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0);
-    AFI->setShouldRestoreSPFromFP(true);
-  }
-
-  // Determine starting offsets of spill areas.
-  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
-  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
-  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
-  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
-  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
-  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
-  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
-
-  NumBytes = DPRCSOffset;
-  if (NumBytes) {
-    // Insert it after all the callee-save spills.
-    emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
-  }
-
-  if (STI.isTargetELF() && hasFP(MF))
-    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
-                             AFI->getFramePtrSpillOffset());
-
-  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
-  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
-  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
-
-  // If we need a base pointer, set it up here. It's whatever the value
-  // of the stack pointer is at this point. Any variable size objects
-  // will be allocated after this, so we can still use the base pointer
-  // to reference locals.
-  if (hasBasePointer(MF))
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP);
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
-  if (MI->getOpcode() == ARM::tRestore &&
-      MI->getOperand(1).isFI() &&
-      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
-    return true;
-  else if (MI->getOpcode() == ARM::tPOP) {
-    // The first two operands are predicates. The last two are
-    // imp-def and imp-use of SP. Check everything in between.
-    for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
-      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
-        return false;
-    return true;
-  }
-  return false;
-}
-
-void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
-                                      MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = prior(MBB.end());
-  assert((MBBI->getOpcode() == ARM::tBX_RET ||
-          MBBI->getOpcode() == ARM::tPOP_RET) &&
-         "Can only insert epilog into returning blocks");
-  DebugLoc dl = MBBI->getDebugLoc();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
-  int NumBytes = (int)MFI->getStackSize();
-  const unsigned *CSRegs = getCalleeSavedRegs();
-
-  if (!AFI->hasStackFrame()) {
-    if (NumBytes != 0)
-      emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
-  } else {
-    // Unwind MBBI to point to first LDR / VLDRD.
-    if (MBBI != MBB.begin()) {
-      do
-        --MBBI;
-      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
-      if (!isCSRestore(MBBI, CSRegs))
-        ++MBBI;
-    }
-
-    // Move SP to start of FP callee save spill area.
-    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
-                 AFI->getGPRCalleeSavedArea2Size() +
-                 AFI->getDPRCalleeSavedAreaSize());
-
-    if (AFI->shouldRestoreSPFromFP()) {
-      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
-      // Reset SP based on frame pointer only if the stack frame extends beyond
-      // frame pointer stack slot or target is ELF and the function has FP.
-      if (NumBytes)
-        emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
-                                  TII, *this, dl);
-      else
-        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
-          .addReg(FramePtr);
-    } else {
-      if (MBBI->getOpcode() == ARM::tBX_RET &&
-          &MBB.front() != MBBI &&
-          prior(MBBI)->getOpcode() == ARM::tPOP) {
-        MachineBasicBlock::iterator PMBBI = prior(MBBI);
-        emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes);
-      } else
-        emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
-    }
-  }
-
-  if (VARegSaveSize) {
-    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
-    // to LR, and we can't pop the value directly to the PC since
-    // we need to update the SP after popping the value. Therefore, we
-    // pop the old LR into R3 as a temporary.
-
-    // Move back past the callee-saved register restoration
-    while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs))
-      ++MBBI;
-    // Epilogue for vararg functions: pop LR to R3 and branch off it.
-    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
-      .addReg(ARM::R3, RegState::Define);
-
-    emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
-
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
-      .addReg(ARM::R3, RegState::Kill);
-    // erase the old tBX_RET instruction
-    MBB.erase(MBBI);
-  }
-}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
index c578054..8a87cc5 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -38,8 +38,6 @@ public:
                         unsigned PredReg = 0) const;
 
   /// Code Generation virtual methods...
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
-
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -59,9 +57,6 @@ public:
                              unsigned Reg) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, RegScavenger *RS = NULL) const;
-
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 };
 }
 
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp
deleted file mode 100644
index 172908d..0000000
--- a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "Thumb2HazardRecognizer.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/ScheduleDAG.h"
-using namespace llvm;
-
-ScheduleHazardRecognizer::HazardType
-Thumb2HazardRecognizer::getHazardType(SUnit *SU) {
-  if (ITBlockSize) {
-    MachineInstr *MI = SU->getInstr();
-    if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1])
-      return Hazard;
-  }
-
-  return PostRAHazardRecognizer::getHazardType(SU);
-}
-
-void Thumb2HazardRecognizer::Reset() {
-  ITBlockSize = 0;
-  PostRAHazardRecognizer::Reset();
-}
-
-void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) {
-  MachineInstr *MI = SU->getInstr();
-  unsigned Opcode = MI->getOpcode();
-  if (ITBlockSize) {
-    --ITBlockSize;
-  } else if (Opcode == ARM::t2IT) {
-    unsigned Mask = MI->getOperand(1).getImm();
-    unsigned NumTZ = CountTrailingZeros_32(Mask);
-    assert(NumTZ <= 3 && "Invalid IT mask!");
-    ITBlockSize = 4 - NumTZ;
-    MachineBasicBlock::iterator I = MI;
-    for (unsigned i = 0; i < ITBlockSize; ++i) {
-      // Advance to the next instruction, skipping any dbg_value instructions.
-      do {
-        ++I;
-      } while (I->isDebugValue());
-      ITBlockMIs[ITBlockSize-1-i] = &*I;
-    }
-  }
-
-  PostRAHazardRecognizer::EmitInstruction(SU);
-}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h
deleted file mode 100644
index 4726658..0000000
--- a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines hazard recognizers for scheduling Thumb2 functions on
-// ARM processors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef THUMB2HAZARDRECOGNIZER_H
-#define THUMB2HAZARDRECOGNIZER_H
-
-#include "llvm/CodeGen/PostRAHazardRecognizer.h"
-
-namespace llvm {
-
-class MachineInstr;
-
-class Thumb2HazardRecognizer : public PostRAHazardRecognizer {
-  unsigned ITBlockSize;  // No. of MIs in current IT block yet to be scheduled.
-  MachineInstr *ITBlockMIs[4];
-
-public:
-  Thumb2HazardRecognizer(const InstrItineraryData &ItinData) :
-    PostRAHazardRecognizer(ItinData) {}
-
-  virtual HazardType getHazardType(SUnit *SU);
-  virtual void Reset();
-  virtual void EmitInstruction(SUnit *SU);
-};
-
-
-} // end namespace llvm
-
-#endif // THUMB2HAZARDRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 442f41d..2f67257 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -17,7 +17,6 @@
 #include "ARMAddressingModes.h"
 #include "ARMGenInstrInfo.inc"
 #include "ARMMachineFunctionInfo.h"
-#include "Thumb2HazardRecognizer.h"
 #include "Thumb2InstrInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,15 +27,10 @@
 
 using namespace llvm;
 
-static cl::opt<unsigned>
-IfCvtLimit("thumb2-ifcvt-limit", cl::Hidden,
-           cl::desc("Thumb2 if-conversion limit (default 3)"),
-           cl::init(3));
-
-static cl::opt<unsigned>
-IfCvtDiamondLimit("thumb2-ifcvt-diamond-limit", cl::Hidden,
-                  cl::desc("Thumb2 diamond if-conversion limit (default 3)"),
-                  cl::init(3));
+static cl::opt<bool>
+OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
+           cl::desc("Use old-style Thumb2 if-conversion heuristics"),
+           cl::init(false));
 
 Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
   : ARMBaseInstrInfo(STI), RI(*this, STI) {
@@ -105,21 +99,6 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
   return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
 }
 
-bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                          unsigned NumInstrs) const {
-  return NumInstrs && NumInstrs <= IfCvtLimit;
-}
-  
-bool Thumb2InstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-                    MachineBasicBlock &FMBB, unsigned NumF) const {
-  // FIXME: Catch optimization such as:
-  //        r0 = movne
-  //        r0 = moveq
-  return NumT && NumF &&
-    NumT <= (IfCvtDiamondLimit) && NumF <= (IfCvtDiamondLimit);
-}
-
 void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   unsigned DestReg, unsigned SrcReg,
@@ -155,8 +134,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     MachineFunction &MF = *MBB.getParent();
     MachineFrameInfo &MFI = *MF.getFrameInfo();
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                              MachineMemOperand::MOStore, 0,
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                              MachineMemOperand::MOStore,
                               MFI.getObjectSize(FI),
                               MFI.getObjectAlignment(FI));
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
@@ -181,8 +161,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     MachineFunction &MF = *MBB.getParent();
     MachineFrameInfo &MFI = *MF.getFrameInfo();
     MachineMemOperand *MMO =
-      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
-                              MachineMemOperand::MOLoad, 0,
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                              MachineMemOperand::MOLoad,
                               MFI.getObjectSize(FI),
                               MFI.getObjectAlignment(FI));
     AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
@@ -193,11 +174,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
 }
 
-ScheduleHazardRecognizer *Thumb2InstrInfo::
-CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const {
-  return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II);
-}
-
 void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator &MBBI, DebugLoc dl,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 3a9f8b1..f2637d7 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -38,11 +38,6 @@ public:
   bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI) const;
 
-  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const;
-  
-  bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
-                           MachineBasicBlock &FMBB, unsigned NumFInstrs) const;
-
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
@@ -70,9 +65,6 @@ public:
   /// always be able to get register info as well (through this method).
   ///
   const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
-
-  ScheduleHazardRecognizer *
-  CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const;
 };
 
 /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 07dd0be..099b8f7 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -29,7 +29,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLocation.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetFrameInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 0c3962d..cc8f61c 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -58,7 +58,7 @@ namespace {
     { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  0,0, 0 },
     { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,    1,   0,  0,1, 0 },
     // Note: immediate scale is 4.
-    { ARM::t2ADDrSPi,ARM::tADDrSPi,0,            8,   0,    1,   0,  1,0, 0 },
+    { ARM::t2ADDrSPi,ARM::tADDrSPi,0,            8,   0,    1,   0,  1,0, 1 },
     { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  2,2, 1 },
     { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,    1,   0,  2,0, 1 },
     { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,    0,   1,  0,0, 0 },
@@ -68,9 +68,7 @@ namespace {
     //FIXME: Disable CMN, as CCodes are backwards from compare expectations
     //{ ARM::t2CMNrr, ARM::tCMN,    0,             0,   0,    1,   0,  2,0, 0 },
     { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,    1,   0,  2,0, 0 },
-    { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 0 },
-    { ARM::t2CMPzri,ARM::tCMPzi8, 0,             8,   0,    1,   0,  2,0, 0 },
-    { ARM::t2CMPzrr,ARM::tCMPzhir,0,             0,   0,    0,   0,  2,0, 0 },
+    { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 1 },
     { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,    0,   1,  0,0, 0 },
     // FIXME: adr.n immediate offset must be multiple of 4.
     //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,     0,   0,    1,   0,  1,0, 0 },
@@ -106,26 +104,27 @@ namespace {
 
     // FIXME: Clean this up after splitting each Thumb load / store opcode
     // into multiple ones.
-    { ARM::t2LDRi12,ARM::tLDR,    ARM::tLDRspi,  5,   8,    1,   0,  0,0, 1 },
-    { ARM::t2LDRs,  ARM::tLDR,    0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRBi12,ARM::tLDRB,  0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRBs, ARM::tLDRB,   0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRHi12,ARM::tLDRH,  0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRHs, ARM::tLDRH,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,    1,   0,  0,0, 1 },
+    { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,    1,   0,  0,0, 1 },
     { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,    1,   0,  0,0, 1 },
     { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRi12,ARM::tSTR,    ARM::tSTRspi,  5,   8,    1,   0,  0,0, 1 },
-    { ARM::t2STRs,  ARM::tSTR,    0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRBi12,ARM::tSTRB,  0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRBs, ARM::tSTRB,   0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRHi12,ARM::tSTRH,  0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRHs, ARM::tSTRH,   0,             0,   0,    1,   0,  0,0, 1 },
-
-    { ARM::t2LDM,   ARM::tLDM,    0,             0,   0,    1,   1,  1,1, 1 },
-    { ARM::t2LDM_RET,0,           ARM::tPOP_RET, 0,   0,    1,   1,  1,1, 1 },
-    { ARM::t2LDM_UPD,ARM::tLDM_UPD,ARM::tPOP,    0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,    1,   0,  0,0, 1 },
+    { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,    1,   0,  0,0, 1 },
+
+    { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,    1,   1,  1,1, 1 },
     // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
-    { ARM::t2STM_UPD,ARM::tSTM_UPD,ARM::tPUSH,   0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,    1,   1,  1,1, 1 },
   };
 
   class Thumb2SizeReduce : public MachineFunctionPass {
@@ -217,8 +216,8 @@ Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
     /// Old opcode has an optional def of CPSR.
     if (HasCC)
       return true;
-    // If both old opcode does not implicit CPSR def, then it's not ok since
-    // these new opcodes CPSR def is not meant to be thrown away. e.g. CMP.
+    // If old opcode does not implicitly define CPSR, then it's not ok since
+    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
     if (!HasImplicitCPSRDef(MI->getDesc()))
       return false;
     HasCC = true;
@@ -233,9 +232,10 @@ Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
 
 static bool VerifyLowRegs(MachineInstr *MI) {
   unsigned Opc = MI->getOpcode();
-  bool isPCOk = (Opc == ARM::t2LDM_RET || Opc == ARM::t2LDM ||
-                 Opc == ARM::t2LDM_UPD);
-  bool isLROk = (Opc == ARM::t2STM_UPD);
+  bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA     ||
+                 Opc == ARM::t2LDMDB     || Opc == ARM::t2LDMIA_UPD ||
+                 Opc == ARM::t2LDMDB_UPD);
+  bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD);
   bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi);
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
@@ -275,29 +275,32 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
   unsigned Opc = Entry.NarrowOpc1;
   unsigned OpNum = 3; // First 'rest' of operands.
   uint8_t  ImmLimit = Entry.Imm1Limit;
+
   switch (Entry.WideOpc) {
   default:
     llvm_unreachable("Unexpected Thumb2 load / store opcode!");
   case ARM::t2LDRi12:
-  case ARM::t2STRi12: {
-    unsigned BaseReg = MI->getOperand(1).getReg();
-    if (BaseReg == ARM::SP) {
+  case ARM::t2STRi12:
+    if (MI->getOperand(1).getReg() == ARM::SP) {
       Opc = Entry.NarrowOpc2;
       ImmLimit = Entry.Imm2Limit;
       HasOffReg = false;
     }
+
     Scale = 4;
     HasImmOffset = true;
+    HasOffReg = false;
     break;
-  }
   case ARM::t2LDRBi12:
   case ARM::t2STRBi12:
     HasImmOffset = true;
+    HasOffReg = false;
     break;
   case ARM::t2LDRHi12:
   case ARM::t2STRHi12:
     Scale = 2;
     HasImmOffset = true;
+    HasOffReg = false;
     break;
   case ARM::t2LDRs:
   case ARM::t2LDRBs:
@@ -310,11 +313,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
     HasShift = true;
     OpNum = 4;
     break;
-  case ARM::t2LDM: {
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB: {
     unsigned BaseReg = MI->getOperand(0).getReg();
-    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
-    if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia)
+    if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA)
       return false;
+
     // For the non-writeback version (this one), the base register must be
     // one of the registers being loaded.
     bool isOK = false;
@@ -324,6 +328,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
         break;
       }
     }
+
     if (!isOK)
       return false;
 
@@ -331,28 +336,33 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
     isLdStMul = true;
     break;
   }
-  case ARM::t2LDM_RET: {
+  case ARM::t2LDMIA_RET: {
     unsigned BaseReg = MI->getOperand(1).getReg();
     if (BaseReg != ARM::SP)
       return false;
     Opc = Entry.NarrowOpc2; // tPOP_RET
-    OpNum = 3;
+    OpNum = 2;
     isLdStMul = true;
     break;
   }
-  case ARM::t2LDM_UPD:
-  case ARM::t2STM_UPD: {
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
     OpNum = 0;
+
     unsigned BaseReg = MI->getOperand(1).getReg();
-    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(2).getImm());
     if (BaseReg == ARM::SP &&
-        ((Entry.WideOpc == ARM::t2LDM_UPD && Mode == ARM_AM::ia) ||
-         (Entry.WideOpc == ARM::t2STM_UPD && Mode == ARM_AM::db))) {
+        (Entry.WideOpc == ARM::t2LDMIA_UPD ||
+         Entry.WideOpc == ARM::t2STMDB_UPD)) {
       Opc = Entry.NarrowOpc2; // tPOP or tPUSH
-      OpNum = 3;
-    } else if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) {
+      OpNum = 2;
+    } else if (!isARMLowRegister(BaseReg) ||
+               (Entry.WideOpc != ARM::t2LDMIA_UPD &&
+                Entry.WideOpc != ARM::t2STMIA_UPD)) {
       return false;
     }
+
     isLdStMul = true;
     break;
   }
@@ -363,6 +373,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
   if (HasShift) {
     OffsetReg  = MI->getOperand(2).getReg();
     OffsetKill = MI->getOperand(2).isKill();
+
     if (MI->getOperand(3).getImm())
       // Thumb1 addressing mode doesn't support shift.
       return false;
@@ -372,23 +383,22 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
   if (HasImmOffset) {
     OffsetImm = MI->getOperand(2).getImm();
     unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
-    if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset)
+
+    if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
       // Make sure the immediate field fits.
       return false;
   }
 
   // Add the 16-bit load / store instruction.
-  // FIXME: Thumb1 addressing mode encode both immediate and register offset.
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
   if (!isLdStMul) {
-    MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1));
-    if (Opc != ARM::tLDRSB && Opc != ARM::tLDRSH) {
-      // tLDRSB and tLDRSH do not have an immediate offset field. On the other
-      // hand, it must have an offset register.
-      // FIXME: Remove this special case.
-      MIB.addImm(OffsetImm/Scale);
-    }
+    MIB.addOperand(MI->getOperand(0));
+    MIB.addOperand(MI->getOperand(1));
+
+    if (HasImmOffset)
+      MIB.addImm(OffsetImm / Scale);
+
     assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
 
     if (HasOffReg)
@@ -423,7 +433,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
   default: break;
-  case ARM::t2ADDSri: 
+  case ARM::t2ADDSri:
   case ARM::t2ADDSrr: {
     unsigned PredReg = 0;
     if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
@@ -451,6 +461,25 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
     if (MI->getOperand(1).isImm())
       return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
     break;
+  case ARM::t2CMPrr: {
+    // Try to reduce to the lo-reg only version first. Why there are two
+    // versions of the instruction is a mystery.
+    // It would be nice to just have two entries in the master table that
+    // are prioritized, but the table assumes a unique entry for each
+    // source insn opcode. So for now, we hack a local entry record to use.
+    static const ReduceEntry NarrowEntry =
+      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 };
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR))
+      return true;
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+  }
+  case ARM::t2ADDrSPi: {
+    static const ReduceEntry NarrowEntry =
+      { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 };
+    if (MI->getOperand(0).getReg() == ARM::SP)
+      return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+  }
   }
   return false;
 }